//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// emitting code.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
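  // (addBypassSlowDiv(N, M) makes CodeGenPrepare emit a run-time check and use
  // the cheaper M-bit divide when an N-bit divide's operands fit in M bits.)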
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);
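
  // There is no instruction that loads and sign-extends a single bit, so all
  // i1 sign-extending loads are promoted.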
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
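  // (ucomiss/ucomisd set ZF for both "equal" and "unordered", so an
  // ordered-equal test also needs a parity-flag check.)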

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
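
  // FLT_ROUNDS_ is custom-lowered to a read of the FP control word to recover
  // the current rounding mode.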
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
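
  // READCYCLECOUNTER is custom-lowered to RDTSC.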
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented; please don't build your own
  // exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
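
  // CMPXCHG16B gives us a native 128-bit compare-and-swap to lower to.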
  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
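  // In 64-bit mode va_list is generally a structure, so VAARG and VACOPY need
  // custom lowering; the 32-bit va_list is a plain pointer that the default
  // expansion handles.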
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (!Subtarget.useSoftFloat()) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // it alone.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
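
    // SSE2 has no direct instructions for the Custom multiplies below; they
    // are emulated (e.g. v2i64 multiply via PMULUDQ partial products).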
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
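    // SSSE3 provides PABSB/PABSW/PABSD for vector integer abs.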
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
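    // SSE4.1 adds ROUNDPS/ROUNDPD/ROUNDSS/ROUNDSD, making these rounding
    // operations legal.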
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    // when we have a 256bit-wide blend with immediate.
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      // Without VLX, widen 128/256-bit masked loads and stores to 512 bits so
      // the AVX-512 k-register forms can be used.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
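
    // AVX512DQ adds native conversions between 64-bit integer elements and FP.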
    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }

    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
    setOperationAction(ISD::ABS, MVT::v2i64, Legal);
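
    // For vXi1 mask vectors, add/sub are just XOR and mul is AND; these and
    // the other mask-vector operations are custom-lowered onto mask registers.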
1305 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1306 setOperationAction(ISD::ADD, VT, Custom);
1307 setOperationAction(ISD::SUB, VT, Custom);
1308 setOperationAction(ISD::MUL, VT, Custom);
1309 setOperationAction(ISD::SETCC, VT, Custom);
1310 setOperationAction(ISD::SELECT, VT, Custom);
1311 setOperationAction(ISD::TRUNCATE, VT, Custom);
1313 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1314 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1315 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1316 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1317 setOperationAction(ISD::VSELECT, VT, Expand);
1320 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1321 setOperationAction(ISD::SMAX, VT, Legal);
1322 setOperationAction(ISD::UMAX, VT, Legal);
1323 setOperationAction(ISD::SMIN, VT, Legal);
1324 setOperationAction(ISD::UMIN, VT, Legal);
1325 setOperationAction(ISD::ABS, VT, Legal);
1326 setOperationAction(ISD::SRL, VT, Custom);
1327 setOperationAction(ISD::SHL, VT, Custom);
1328 setOperationAction(ISD::SRA, VT, Custom);
1329 setOperationAction(ISD::CTPOP, VT, Custom);
1330 setOperationAction(ISD::CTTZ, VT, Custom);
1333 // Need to promote to 64-bit even though we have 32-bit masked instructions
1334 // because the IR optimizers rearrange bitcasts around logic ops leaving
1335 // too many variations to handle if we don't promote them.
1336 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1337 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1338 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
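// Illustrative sketch (editorial, hypothetical IR): the promotion above means
// a 512-bit v16i32 logic op is selected over the v8i64 type, e.g.
//   (v16i32 (bitcast (and (v8i64 (bitcast %a)), (v8i64 (bitcast %b)))))
// so a single q-form instruction covers the bitcast variations the IR
// optimizers produce.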
1340 if (Subtarget.hasCDI()) {
1341 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1342 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1343 MVT::v4i64, MVT::v8i64}) {
1344 setOperationAction(ISD::CTLZ, VT, Legal);
1345 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1347 } // Subtarget.hasCDI()
1349 if (Subtarget.hasDQI()) {
1350 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1351 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1352 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1353 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1356 if (Subtarget.hasVPOPCNTDQ()) {
1357 // VPOPCNTDQ sub-targets extend 128/256-bit vectors to use the AVX-512
1358 // version of vpopcntd/vpopcntq.
1359 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1360 MVT::v4i32, MVT::v2i64})
1361 setOperationAction(ISD::CTPOP, VT, Legal);
1364 // Custom lower several nodes.
1365 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1366 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1367 setOperationAction(ISD::MGATHER, VT, Custom);
1368 setOperationAction(ISD::MSCATTER, VT, Custom);
1370 // Extract subvector is special because the value type
1371 // (result) is 256-bit but the source is 512-bit wide.
1372 // 128-bit was made Custom under AVX1.
1373 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1374 MVT::v8f32, MVT::v4f64 })
1375 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1376 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1377 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1378 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1380 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1381 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1382 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1383 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1384 setOperationAction(ISD::VSELECT, VT, Custom);
1385 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1386 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1387 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1388 setOperationAction(ISD::MLOAD, VT, Legal);
1389 setOperationAction(ISD::MSTORE, VT, Legal);
1390 setOperationAction(ISD::MGATHER, VT, Legal);
1391 setOperationAction(ISD::MSCATTER, VT, Custom);
1393 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1394 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1395 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1399 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1400 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1401 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1403 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1404 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1406 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1407 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1408 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1409 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1410 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1411 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1413 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1414 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1415 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1416 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1417 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1418 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1419 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1420 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1421 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1422 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1423 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1424 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1425 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1426 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1427 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1428 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1429 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1430 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1431 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1432 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1433 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1434 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1435 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1437 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1438 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1439 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1440 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1441 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1442 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1443 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1444 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1445 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1446 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1447 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1448 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1449 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1450 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1451 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1452 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1453 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1454 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1455 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1456 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1457 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1459 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1461 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1462 if (Subtarget.hasVLX()) {
1463 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1464 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1467 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1468 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1469 setOperationAction(ISD::MLOAD, VT, Action);
1470 setOperationAction(ISD::MSTORE, VT, Action);
1473 if (Subtarget.hasCDI()) {
1474 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1475 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1478 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1479 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1480 setOperationAction(ISD::VSELECT, VT, Custom);
1481 setOperationAction(ISD::ABS, VT, Legal);
1482 setOperationAction(ISD::SRL, VT, Custom);
1483 setOperationAction(ISD::SHL, VT, Custom);
1484 setOperationAction(ISD::SRA, VT, Custom);
1485 setOperationAction(ISD::MLOAD, VT, Legal);
1486 setOperationAction(ISD::MSTORE, VT, Legal);
1487 setOperationAction(ISD::CTPOP, VT, Custom);
1488 setOperationAction(ISD::CTTZ, VT, Custom);
1489 setOperationAction(ISD::SMAX, VT, Legal);
1490 setOperationAction(ISD::UMAX, VT, Legal);
1491 setOperationAction(ISD::SMIN, VT, Legal);
1492 setOperationAction(ISD::UMIN, VT, Legal);
1494 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1495 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1496 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1499 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1500 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1501 if (Subtarget.hasVLX()) {
1502 // FIXME: These instructions are also available on SSE/AVX2; add the relevant patterns.
1503 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1504 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1509 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1510 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1511 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1513 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1514 setOperationAction(ISD::ADD, VT, Custom);
1515 setOperationAction(ISD::SUB, VT, Custom);
1516 setOperationAction(ISD::MUL, VT, Custom);
1517 setOperationAction(ISD::VSELECT, VT, Expand);
1519 setOperationAction(ISD::TRUNCATE, VT, Custom);
1520 setOperationAction(ISD::SETCC, VT, Custom);
1521 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1522 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1523 setOperationAction(ISD::SELECT, VT, Custom);
1524 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1525 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1528 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1529 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1530 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1531 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1533 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1534 setOperationAction(ISD::SMAX, VT, Legal);
1535 setOperationAction(ISD::UMAX, VT, Legal);
1536 setOperationAction(ISD::SMIN, VT, Legal);
1537 setOperationAction(ISD::UMIN, VT, Legal);
1541 // We want to custom lower some of our intrinsics.
1542 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1543 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1544 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1545 if (!Subtarget.is64Bit()) {
1546 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1547 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1550 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1551 // handle type legalization for these operations here.
1553 // FIXME: We really should do custom legalization for addition and
1554 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1555 // than generic legalization for 64-bit multiplication-with-overflow, though.
1556 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1557 if (VT == MVT::i64 && !Subtarget.is64Bit())
1559 // Add/Sub/Mul with overflow operations are custom lowered.
1560 setOperationAction(ISD::SADDO, VT, Custom);
1561 setOperationAction(ISD::UADDO, VT, Custom);
1562 setOperationAction(ISD::SSUBO, VT, Custom);
1563 setOperationAction(ISD::USUBO, VT, Custom);
1564 setOperationAction(ISD::SMULO, VT, Custom);
1565 setOperationAction(ISD::UMULO, VT, Custom);
1567 // Support carry-in as a value rather than glue.
1568 setOperationAction(ISD::ADDCARRY, VT, Custom);
1569 setOperationAction(ISD::SUBCARRY, VT, Custom);
1570 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1573 if (!Subtarget.is64Bit()) {
1574 // These libcalls are not available on 32-bit targets.
1575 setLibcallName(RTLIB::SHL_I128, nullptr);
1576 setLibcallName(RTLIB::SRL_I128, nullptr);
1577 setLibcallName(RTLIB::SRA_I128, nullptr);
1580 // Combine sin / cos into one node or libcall if possible.
1581 if (Subtarget.hasSinCos()) {
1582 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1583 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1584 if (Subtarget.isTargetDarwin()) {
1585 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1586 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1587 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1588 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1592 if (Subtarget.isTargetWin64()) {
1593 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1594 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1595 setOperationAction(ISD::SREM, MVT::i128, Custom);
1596 setOperationAction(ISD::UREM, MVT::i128, Custom);
1597 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1598 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1601 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1602 // is. We should promote the value to 64 bits to solve this.
1603 // This is what the CRT headers do - `fmodf` is an inline header
1604 // function that casts to f64 and calls `fmod`.
1605 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1606 Subtarget.isTargetWindowsItanium()))
1607 for (ISD::NodeType Op :
1608 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1609 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1610 if (isOperationExpand(Op, MVT::f32))
1611 setOperationAction(Op, MVT::f32, Promote);
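// For reference, a sketch of the CRT-style inline this promotion mirrors
// (hypothetical reconstruction of the header behavior):
//   static inline float fmodf(float X, float Y) {
//     return (float)fmod((double)X, (double)Y);
//   }
// Promote on f32 performs the same widen-compute-narrow sequence during ISel.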
1613 // We have target-specific dag combine patterns for the following nodes:
1614 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1615 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1616 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1617 setTargetDAGCombine(ISD::BITCAST);
1618 setTargetDAGCombine(ISD::VSELECT);
1619 setTargetDAGCombine(ISD::SELECT);
1620 setTargetDAGCombine(ISD::SHL);
1621 setTargetDAGCombine(ISD::SRA);
1622 setTargetDAGCombine(ISD::SRL);
1623 setTargetDAGCombine(ISD::OR);
1624 setTargetDAGCombine(ISD::AND);
1625 setTargetDAGCombine(ISD::ADD);
1626 setTargetDAGCombine(ISD::FADD);
1627 setTargetDAGCombine(ISD::FSUB);
1628 setTargetDAGCombine(ISD::FNEG);
1629 setTargetDAGCombine(ISD::FMA);
1630 setTargetDAGCombine(ISD::FMINNUM);
1631 setTargetDAGCombine(ISD::FMAXNUM);
1632 setTargetDAGCombine(ISD::SUB);
1633 setTargetDAGCombine(ISD::LOAD);
1634 setTargetDAGCombine(ISD::MLOAD);
1635 setTargetDAGCombine(ISD::STORE);
1636 setTargetDAGCombine(ISD::MSTORE);
1637 setTargetDAGCombine(ISD::TRUNCATE);
1638 setTargetDAGCombine(ISD::ZERO_EXTEND);
1639 setTargetDAGCombine(ISD::ANY_EXTEND);
1640 setTargetDAGCombine(ISD::SIGN_EXTEND);
1641 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1642 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1643 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1644 setTargetDAGCombine(ISD::SINT_TO_FP);
1645 setTargetDAGCombine(ISD::UINT_TO_FP);
1646 setTargetDAGCombine(ISD::SETCC);
1647 setTargetDAGCombine(ISD::MUL);
1648 setTargetDAGCombine(ISD::XOR);
1649 setTargetDAGCombine(ISD::MSCATTER);
1650 setTargetDAGCombine(ISD::MGATHER);
1652 computeRegisterProperties(Subtarget.getRegisterInfo());
1654 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1655 MaxStoresPerMemsetOptSize = 8;
1656 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1657 MaxStoresPerMemcpyOptSize = 4;
1658 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1659 MaxStoresPerMemmoveOptSize = 4;
1660 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1661 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
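// E.g. (sketch): the parameter is a log2 value, so the byte alignment is
//   unsigned AlignBytes = 1u << ExperimentalPrefLoopAlignment; // 4 -> 16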
1663 // An out-of-order CPU can speculatively execute past a predictable branch,
1664 // but a conditional move could be stalled by an expensive earlier operation.
1665 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1666 EnableExtLdPromotion = true;
1667 setPrefFunctionAlignment(4); // 2^4 bytes.
1669 verifyIntrinsicTables();
1672 // This has so far only been implemented for 64-bit MachO.
1673 bool X86TargetLowering::useLoadStackGuardNode() const {
1674 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1677 TargetLoweringBase::LegalizeTypeAction
1678 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1679 if (ExperimentalVectorWideningLegalization &&
1680 VT.getVectorNumElements() != 1 &&
1681 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1682 return TypeWidenVector;
1684 return TargetLoweringBase::getPreferredVectorAction(VT);
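// Usage sketch (hypothetical helper, not used by the lowering itself): query
// whether the hook above chooses widening for a given vector type; with the
// experimental flag set, an illegal v2i32 is widened rather than promoted.
static bool prefersWideningSketch(const X86TargetLowering &TLI, EVT VT) {
  return TLI.getPreferredVectorAction(VT) ==
         TargetLoweringBase::TypeWidenVector;
}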
1687 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1688 LLVMContext& Context,
1693 if (VT.isSimple()) {
1694 MVT VVT = VT.getSimpleVT();
1695 const unsigned NumElts = VVT.getVectorNumElements();
1696 MVT EltVT = VVT.getVectorElementType();
1697 if (VVT.is512BitVector()) {
1698 if (Subtarget.hasAVX512())
1699 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1700 EltVT == MVT::f32 || EltVT == MVT::f64)
1702 case 8: return MVT::v8i1;
1703 case 16: return MVT::v16i1;
1705 if (Subtarget.hasBWI())
1706 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1708 case 32: return MVT::v32i1;
1709 case 64: return MVT::v64i1;
1713 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1714 return MVT::getVectorVT(MVT::i1, NumElts);
1716 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1717 EVT LegalVT = getTypeToTransformTo(Context, VT);
1718 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1721 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1723 case 2: return MVT::v2i1;
1724 case 4: return MVT::v4i1;
1725 case 8: return MVT::v8i1;
1729 return VT.changeVectorElementTypeToInteger();
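// Example sketch (hypothetical helper; assumes an AVX512F subtarget): per the
// logic above, a 512-bit integer compare produces a vXi1 mask type.
static void setCCResultTypeExampleSketch(const X86TargetLowering &TLI,
                                         const DataLayout &DL,
                                         LLVMContext &Ctx) {
  assert(TLI.getSetCCResultType(DL, Ctx, MVT::v16i32) == MVT::v16i1 &&
         "v16i32 compares should yield a v16i1 mask on AVX512F");
  (void)TLI; (void)DL; (void)Ctx; // Silence unused warnings in NDEBUG builds.
}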
1732 /// Helper for getByValTypeAlignment to determine
1733 /// the desired ByVal argument alignment.
1734 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1737 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1738 if (VTy->getBitWidth() == 128)
1740 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1741 unsigned EltAlign = 0;
1742 getMaxByValAlign(ATy->getElementType(), EltAlign);
1743 if (EltAlign > MaxAlign)
1744 MaxAlign = EltAlign;
1745 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1746 for (auto *EltTy : STy->elements()) {
1747 unsigned EltAlign = 0;
1748 getMaxByValAlign(EltTy, EltAlign);
1749 if (EltAlign > MaxAlign)
1750 MaxAlign = EltAlign;
1757 /// Return the desired alignment for ByVal aggregate
1758 /// function arguments in the caller parameter area. For X86, aggregates
1759 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1760 /// are at 4-byte boundaries.
1761 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1762 const DataLayout &DL) const {
1763 if (Subtarget.is64Bit()) {
1764 // Max of 8 and alignment of type.
1765 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1772 if (Subtarget.hasSSE1())
1773 getMaxByValAlign(Ty, Align);
1777 /// Returns the target specific optimal type for load
1778 /// and store operations as a result of memset, memcpy, and memmove
1779 /// lowering. If DstAlign is zero, the destination alignment can satisfy
1780 /// any constraint. Similarly, if SrcAlign is zero there is no need to
1781 /// check it against an alignment requirement,
1782 /// probably because the source does not need to be loaded. If 'IsMemset' is
1783 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1784 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1785 /// source is constant so it does not need to be loaded.
1786 /// It returns EVT::Other if the type should be determined using generic
1787 /// target-independent logic.
1789 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1790 unsigned DstAlign, unsigned SrcAlign,
1791 bool IsMemset, bool ZeroMemset,
1793 MachineFunction &MF) const {
1794 const Function *F = MF.getFunction();
1795 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1797 (!Subtarget.isUnalignedMem16Slow() ||
1798 ((DstAlign == 0 || DstAlign >= 16) &&
1799 (SrcAlign == 0 || SrcAlign >= 16)))) {
1800 // FIXME: Check if unaligned 32-byte accesses are slow.
1801 if (Size >= 32 && Subtarget.hasAVX()) {
1802 // Although this isn't a well-supported type for AVX1, we'll let
1803 // legalization and shuffle lowering produce the optimal codegen. If we
1804 // choose an optimal type with a vector element larger than a byte,
1805 // getMemsetStores() may create an intermediate splat (using an integer
1806 // multiply) before we splat as a vector.
1809 if (Subtarget.hasSSE2())
1811 // TODO: Can SSE1 handle a byte vector?
1812 if (Subtarget.hasSSE1())
1814 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1815 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1816 // Do not use f64 to lower memcpy if source is string constant. It's
1817 // better to use i32 to avoid the loads.
1818 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1819 // The gymnastics of splatting a byte value into an XMM register and then
1820 // only using 8-byte stores (because this is a CPU with slow unaligned
1821 // 16-byte accesses) makes that a loser.
1825 // This is a compromise. If we reach here, unaligned accesses may be slow on
1826 // this target. However, creating smaller, aligned accesses could be even
1827 // slower and would certainly be a lot more code.
1828 if (Subtarget.is64Bit() && Size >= 8)
1833 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1835 return X86ScalarSSEf32;
1836 else if (VT == MVT::f64)
1837 return X86ScalarSSEf64;
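// Usage sketch (hypothetical helper): how a 32-byte memcpy with unknown
// alignment would query getOptimalMemOpType above; on AVX subtargets with
// fast unaligned accesses this yields MVT::v32i8.
static EVT optimalMemcpyTypeSketch(const X86TargetLowering &TLI,
                                   MachineFunction &MF) {
  return TLI.getOptimalMemOpType(/*Size=*/32, /*DstAlign=*/0, /*SrcAlign=*/0,
                                 /*IsMemset=*/false, /*ZeroMemset=*/false,
                                 /*MemcpyStrSrc=*/false, MF);
}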
1842 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1847 switch (VT.getSizeInBits()) {
1849 // 8-byte and under are always assumed to be fast.
1853 *Fast = !Subtarget.isUnalignedMem16Slow();
1856 *Fast = !Subtarget.isUnalignedMem32Slow();
1858 // TODO: What about AVX-512 (512-bit) accesses?
1861 // Misaligned accesses of any size are always allowed.
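// Usage sketch (hypothetical helper, assuming the signature used in this
// revision): query whether an unaligned 256-bit access is fast here.
static bool fastUnaligned256Sketch(const X86TargetLowering &TLI) {
  bool Fast = false;
  (void)TLI.allowsMisalignedMemoryAccesses(MVT::v32i8, /*AddrSpace=*/0,
                                           /*Align=*/1, &Fast);
  return Fast;
}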
1865 /// Return the entry encoding for a jump table in the
1866 /// current function. The returned value is a member of the
1867 /// MachineJumpTableInfo::JTEntryKind enum.
1868 unsigned X86TargetLowering::getJumpTableEncoding() const {
1869 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1870 // symbol.
1871 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1872 return MachineJumpTableInfo::EK_Custom32;
1874 // Otherwise, use the normal jump table encoding heuristics.
1875 return TargetLowering::getJumpTableEncoding();
1878 bool X86TargetLowering::useSoftFloat() const {
1879 return Subtarget.useSoftFloat();
1882 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1883 ArgListTy &Args) const {
1885 // Only relabel X86-32 for C / Stdcall CCs.
1886 if (Subtarget.is64Bit())
1888 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1890 unsigned ParamRegs = 0;
1891 if (auto *M = MF->getFunction()->getParent())
1892 ParamRegs = M->getNumberRegisterParameters();
1894 // Mark the first N integer arguments as being passed in registers.
1895 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1896 Type *T = Args[Idx].Ty;
1897 if (T->isPointerTy() || T->isIntegerTy())
1898 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1899 unsigned numRegs = 1;
1900 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1902 if (ParamRegs < numRegs)
1904 ParamRegs -= numRegs;
1905 Args[Idx].IsInReg = true;
1911 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1912 const MachineBasicBlock *MBB,
2913 unsigned uid, MCContext &Ctx) const {
1914 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1915 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1916 // entries.
1917 return MCSymbolRefExpr::create(MBB->getSymbol(),
1918 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1921 /// Returns relocation base for the given PIC jumptable.
1922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1923 SelectionDAG &DAG) const {
1924 if (!Subtarget.is64Bit())
1925 // This doesn't have SDLoc associated with it, but is not really the
1926 // same as a Register.
1927 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1928 getPointerTy(DAG.getDataLayout()));
1932 /// This returns the relocation base for the given PIC jumptable,
1933 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1934 const MCExpr *X86TargetLowering::
1935 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1936 MCContext &Ctx) const {
1937 // X86-64 uses RIP relative addressing based on the jump table label.
1938 if (Subtarget.isPICStyleRIPRel())
1939 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1941 // Otherwise, the reference is relative to the PIC base.
1942 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1945 std::pair<const TargetRegisterClass *, uint8_t>
1946 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1948 const TargetRegisterClass *RRC = nullptr;
1950 switch (VT.SimpleTy) {
1952 return TargetLowering::findRepresentativeClass(TRI, VT);
1953 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1954 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1957 RRC = &X86::VR64RegClass;
1959 case MVT::f32: case MVT::f64:
1960 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1961 case MVT::v4f32: case MVT::v2f64:
1962 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1963 case MVT::v8f32: case MVT::v4f64:
1964 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1965 case MVT::v16f32: case MVT::v8f64:
1966 RRC = &X86::VR128XRegClass;
1969 return std::make_pair(RRC, Cost);
1972 unsigned X86TargetLowering::getAddressSpace() const {
1973 if (Subtarget.is64Bit())
1974 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1978 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
1979 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
1980 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
1983 static Constant* SegmentOffset(IRBuilder<> &IRB,
1984 unsigned Offset, unsigned AddressSpace) {
1985 return ConstantExpr::getIntToPtr(
1986 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
1987 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
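// Example sketch (hypothetical helper): SegmentOffset(IRB, 0x28, 257) yields
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. the %fs:0x28 stack-guard slot on x86-64 (AS 257 models %fs, 256 %gs).
static Value *linuxStackGuardSlotSketch(IRBuilder<> &IRB) {
  return SegmentOffset(IRB, /*Offset=*/0x28, /*AddressSpace=*/257);
}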
1990 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
1991 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
1992 // tcbhead_t; use it instead of the usual global variable (see
1993 // sysdeps/{i386,x86_64}/nptl/tls.h)
1994 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
1995 if (Subtarget.isTargetFuchsia()) {
1996 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
1997 return SegmentOffset(IRB, 0x10, getAddressSpace());
1999 // %fs:0x28, unless we're using a Kernel code model, in which case
2000 // it's %gs:0x28. On i386 it's %gs:0x14.
2001 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2002 return SegmentOffset(IRB, Offset, getAddressSpace());
2006 return TargetLowering::getIRStackGuard(IRB);
2009 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2010 // The MSVC CRT provides functionality for stack protection.
2011 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2012 // MSVC CRT has a global variable holding security cookie.
2013 M.getOrInsertGlobal("__security_cookie",
2014 Type::getInt8PtrTy(M.getContext()));
2016 // MSVC CRT has a function to validate security cookie.
2017 auto *SecurityCheckCookie = cast<Function>(
2018 M.getOrInsertFunction("__security_check_cookie",
2019 Type::getVoidTy(M.getContext()),
2020 Type::getInt8PtrTy(M.getContext())));
2021 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2022 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2025 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2026 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2028 TargetLowering::insertSSPDeclarations(M);
2031 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2032 // MSVC CRT has a global variable holding security cookie.
2033 if (Subtarget.getTargetTriple().isOSMSVCRT())
2034 return M.getGlobalVariable("__security_cookie");
2035 return TargetLowering::getSDagStackGuard(M);
2038 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2039 // MSVC CRT has a function to validate security cookie.
2040 if (Subtarget.getTargetTriple().isOSMSVCRT())
2041 return M.getFunction("__security_check_cookie");
2042 return TargetLowering::getSSPStackGuardCheck(M);
2045 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2046 if (Subtarget.getTargetTriple().isOSContiki())
2047 return getDefaultSafeStackPointerLocation(IRB, false);
2049 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2050 // definition of TLS_SLOT_SAFESTACK in
2051 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2052 if (Subtarget.isTargetAndroid()) {
2053 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2055 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2056 return SegmentOffset(IRB, Offset, getAddressSpace());
2059 // Fuchsia is similar.
2060 if (Subtarget.isTargetFuchsia()) {
2061 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2062 return SegmentOffset(IRB, 0x18, getAddressSpace());
2065 return TargetLowering::getSafeStackPointerLocation(IRB);
2068 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2069 unsigned DestAS) const {
2070 assert(SrcAS != DestAS && "Expected different address spaces!");
2072 return SrcAS < 256 && DestAS < 256;
2075 //===----------------------------------------------------------------------===//
2076 // Return Value Calling Convention Implementation
2077 //===----------------------------------------------------------------------===//
2079 #include "X86GenCallingConv.inc"
2081 bool X86TargetLowering::CanLowerReturn(
2082 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2083 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2084 SmallVector<CCValAssign, 16> RVLocs;
2085 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2086 return CCInfo.CheckReturn(Outs, RetCC_X86);
2089 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2090 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2094 /// Lowers mask values (v*i1) to the local register values.
2095 /// \returns DAG node after lowering to register type
2096 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2097 const SDLoc &Dl, SelectionDAG &DAG) {
2098 EVT ValVT = ValArg.getValueType();
2100 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2101 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2102 // Two-stage lowering might be required:
2103 // bitcast: v8i1 -> i8 / v16i1 -> i16
2104 // anyextend: i8 -> i32 / i16 -> i32
2105 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2106 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2107 if (ValLoc == MVT::i32)
2108 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2110 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2111 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2112 // One-stage lowering is required:
2113 // bitcast: v32i1 -> i32 / v64i1 -> i64
2114 return DAG.getBitcast(ValLoc, ValArg);
2116 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
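// Usage sketch (hypothetical helper): a v16i1 mask destined for a 32-bit
// location takes the two-stage path above (bitcast to i16, any-extend to i32).
static SDValue lowerV16i1MaskToI32Sketch(SDValue Mask, const SDLoc &Dl,
                                         SelectionDAG &DAG) {
  assert(Mask.getValueType() == MVT::v16i1 && "Sketch expects a v16i1 mask");
  return lowerMasksToReg(Mask, MVT::i32, Dl, DAG);
}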
2119 /// Breaks a v64i1 value into two registers and adds the new node to the DAG.
2120 static void Passv64i1ArgInRegs(
2121 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2122 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2123 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2124 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2125 "Expected AVX512BW or BMI target!");
2126 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2127 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2128 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2129 "The value should reside in two registers");
2131 // Before splitting the value we cast it to i64
2132 Arg = DAG.getBitcast(MVT::i64, Arg);
2134 // Split the value into two i32 halves.
2136 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2137 DAG.getConstant(0, Dl, MVT::i32));
2138 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2139 DAG.getConstant(1, Dl, MVT::i32));
2141 // Attach the two i32 halves to the corresponding registers.
2142 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2143 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2147 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2149 const SmallVectorImpl<ISD::OutputArg> &Outs,
2150 const SmallVectorImpl<SDValue> &OutVals,
2151 const SDLoc &dl, SelectionDAG &DAG) const {
2152 MachineFunction &MF = DAG.getMachineFunction();
2153 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2155 // In some cases we need to disable registers from the default CSR list.
2156 // For example, when they are used for argument passing.
2157 bool ShouldDisableCalleeSavedRegister =
2158 CallConv == CallingConv::X86_RegCall ||
2159 MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2161 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2162 report_fatal_error("X86 interrupts may not return any value");
2164 SmallVector<CCValAssign, 16> RVLocs;
2165 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2166 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2169 SmallVector<SDValue, 6> RetOps;
2170 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2171 // Operand #1 = Bytes To Pop
2172 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2175 // Copy the result values into the output registers.
2176 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2178 CCValAssign &VA = RVLocs[I];
2179 assert(VA.isRegLoc() && "Can only return in registers!");
2181 // Add the register to the CalleeSaveDisableRegs list.
2182 if (ShouldDisableCalleeSavedRegister)
2183 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2185 SDValue ValToCopy = OutVals[OutsIndex];
2186 EVT ValVT = ValToCopy.getValueType();
2188 // Promote values to the appropriate types.
2189 if (VA.getLocInfo() == CCValAssign::SExt)
2190 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2191 else if (VA.getLocInfo() == CCValAssign::ZExt)
2192 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2193 else if (VA.getLocInfo() == CCValAssign::AExt) {
2194 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2195 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2197 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2199 else if (VA.getLocInfo() == CCValAssign::BCvt)
2200 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2202 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2203 "Unexpected FP-extend for return value.");
2205 // If this is x86-64, and we disabled SSE, we can't return FP values,
2206 // or SSE or MMX vectors.
2207 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2208 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2209 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2210 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2211 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2212 } else if (ValVT == MVT::f64 &&
2213 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2214 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2215 // llvm-gcc has never done it right and no one has noticed, so this
2216 // should be OK for now.
2217 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2218 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2221 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2222 // the RET instruction and handled by the FP Stackifier.
2223 if (VA.getLocReg() == X86::FP0 ||
2224 VA.getLocReg() == X86::FP1) {
2225 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2226 // change the value to the FP stack register class.
2227 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2228 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2229 RetOps.push_back(ValToCopy);
2230 // Don't emit a copytoreg.
2234 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2235 // which is returned in RAX / RDX.
2236 if (Subtarget.is64Bit()) {
2237 if (ValVT == MVT::x86mmx) {
2238 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2239 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2240 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2242 // If we don't have SSE2 available, convert to v4f32 so the generated
2243 // register is legal.
2244 if (!Subtarget.hasSSE2())
2245 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2250 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2252 if (VA.needsCustom()) {
2253 assert(VA.getValVT() == MVT::v64i1 &&
2254 "Currently the only custom case is when we split v64i1 to 2 regs");
2256 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2259 assert(2 == RegsToPass.size() &&
2260 "Expecting two registers after Pass64BitArgInRegs");
2262 // Add the second register to the CalleeSaveDisableRegs list.
2263 if (ShouldDisableCalleeSavedRegister)
2264 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2266 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2269 // Add nodes to the DAG and add the values into the RetOps list
2270 for (auto &Reg : RegsToPass) {
2271 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2272 Flag = Chain.getValue(1);
2273 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2277 // The Swift calling convention does not require us to copy the sret argument
2278 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2280 // All x86 ABIs require that for returning structs by value we copy
2281 // the sret argument into %rax/%eax (depending on ABI) for the return.
2282 // We saved the argument into a virtual register in the entry block,
2283 // so now we copy the value out and into %rax/%eax.
2285 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2286 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2287 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2288 // either case FuncInfo->setSRetReturnReg() will have been called.
2289 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2290 // When we have both sret and another return value, we should use the
2291 // original Chain stored in RetOps[0], instead of the current Chain updated
2292 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2294 // For the case of sret and another return value, we have
2295 // Chain_0 at the function entry
2296 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2297 // If we use Chain_1 in getCopyFromReg, we will have
2298 // Val = getCopyFromReg(Chain_1)
2299 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2301 // getCopyToReg(Chain_0) will be glued together with
2302 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2303 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2304 // Data dependency from Unit B to Unit A due to usage of Val in
2305 // getCopyToReg(Chain_1, Val)
2306 // Chain dependency from Unit A to Unit B
2308 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2309 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2310 getPointerTy(MF.getDataLayout()));
2313 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2314 X86::RAX : X86::EAX;
2315 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2316 Flag = Chain.getValue(1);
2318 // RAX/EAX now acts like a return value.
2320 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2322 // Add the returned register to the CalleeSaveDisableRegs list.
2323 if (ShouldDisableCalleeSavedRegister)
2324 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2327 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2328 const MCPhysReg *I =
2329 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2332 if (X86::GR64RegClass.contains(*I))
2333 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2335 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2339 RetOps[0] = Chain; // Update chain.
2341 // Add the flag if we have it.
2343 RetOps.push_back(Flag);
2345 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2346 if (CallConv == CallingConv::X86_INTR)
2347 opcode = X86ISD::IRET;
2348 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2351 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2352 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2355 SDValue TCChain = Chain;
2356 SDNode *Copy = *N->use_begin();
2357 if (Copy->getOpcode() == ISD::CopyToReg) {
2358 // If the copy has a glue operand, we conservatively assume it isn't safe to
2359 // perform a tail call.
2360 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2362 TCChain = Copy->getOperand(0);
2363 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2366 bool HasRet = false;
2367 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2369 if (UI->getOpcode() != X86ISD::RET_FLAG)
2371 // If we are returning more than one value, we can definitely
2372 // not make a tail call; see PR19530.
2373 if (UI->getNumOperands() > 4)
2375 if (UI->getNumOperands() == 4 &&
2376 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2388 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2389 ISD::NodeType ExtendKind) const {
2390 MVT ReturnMVT = MVT::i32;
2392 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2393 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2394 // The ABI does not require i1, i8 or i16 to be extended.
2396 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2397 // always extending i8/i16 return values, so keep doing that for now.
2399 ReturnMVT = MVT::i8;
2402 EVT MinVT = getRegisterType(Context, ReturnMVT);
2403 return VT.bitsLT(MinVT) ? MinVT : VT;
2406 /// Reads two 32-bit registers and creates a 64-bit mask value.
2407 /// \param VA The current 32-bit value that needs to be assigned.
2408 /// \param NextVA The next 32-bit value that needs to be assigned.
2409 /// \param Root The parent DAG node.
2410 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2411 /// glue purposes. In case the DAG is already using a
2412 /// physical register instead of a virtual one, we should glue
2413 /// our new SDValue to the InFlag SDValue.
2414 /// \return a new 64-bit SDValue.
2415 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2416 SDValue &Root, SelectionDAG &DAG,
2417 const SDLoc &Dl, const X86Subtarget &Subtarget,
2418 SDValue *InFlag = nullptr) {
2419 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2420 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2421 assert(VA.getValVT() == MVT::v64i1 &&
2422 "Expecting first location of 64 bit width type");
2423 assert(NextVA.getValVT() == VA.getValVT() &&
2424 "The locations should have the same type");
2425 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2426 "The values should reside in two registers");
2430 SDValue ArgValueLo, ArgValueHi;
2432 MachineFunction &MF = DAG.getMachineFunction();
2433 const TargetRegisterClass *RC = &X86::GR32RegClass;
2435 // Read a 32 bit value from the registers
2436 if (nullptr == InFlag) {
2437 // When no physical register is present,
2438 // create an intermediate virtual register
2439 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2440 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2441 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2442 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2444 // When a physical register is available read the value from it and glue
2445 // the reads together.
2447 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2448 *InFlag = ArgValueLo.getValue(2);
2450 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2451 *InFlag = ArgValueHi.getValue(2);
2454 // Convert the lower i32 into a v32i1 mask
2455 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2457 // Convert the upper i32 into a v32i1 mask
2458 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2460 // Concatenate the two values together
2461 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2464 /// Lowers a register value of size 8/16/32/64 bits
2465 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2466 /// \returns a DAG node containing the operand after lowering to mask type.
2467 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2468 const EVT &ValLoc, const SDLoc &Dl,
2469 SelectionDAG &DAG) {
2470 SDValue ValReturned = ValArg;
2472 if (ValVT == MVT::v1i1)
2473 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2475 if (ValVT == MVT::v64i1) {
2476 // On 32-bit targets this case is handled by getv64i1Argument.
2477 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2478 // On 64-bit targets there is no need to truncate the value; only bitcast it.
2481 switch (ValVT.getSimpleVT().SimpleTy) {
2492 llvm_unreachable("Expecting a vector of i1 types");
2495 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2497 return DAG.getBitcast(ValVT, ValReturned);
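// Usage sketch (hypothetical helper): the inverse direction; an i32 location
// carrying a v8i1 mask is truncated to i8 and bitcast back to the mask type.
static SDValue recoverV8i1MaskSketch(SDValue Loc, const SDLoc &Dl,
                                     SelectionDAG &DAG) {
  assert(Loc.getValueType() == MVT::i32 && "Sketch expects an i32 location");
  return lowerRegToMasks(Loc, MVT::v8i1, MVT::i32, Dl, DAG);
}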
2500 /// Lower the result values of a call into the
2501 /// appropriate copies out of appropriate physical registers.
2503 SDValue X86TargetLowering::LowerCallResult(
2504 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2505 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2506 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2507 uint32_t *RegMask) const {
2509 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2510 // Assign locations to each value returned by this call.
2511 SmallVector<CCValAssign, 16> RVLocs;
2512 bool Is64Bit = Subtarget.is64Bit();
2513 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2515 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2517 // Copy all of the result registers out of their specified physreg.
2518 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2520 CCValAssign &VA = RVLocs[I];
2521 EVT CopyVT = VA.getLocVT();
2523 // In some calling conventions we need to remove the used registers
2524 // from the register mask.
2526 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2527 SubRegs.isValid(); ++SubRegs)
2528 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2531 // If this is x86-64, and we disabled SSE, we can't return FP values
2532 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2533 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2534 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2535 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2538 // If we prefer to use the value in xmm registers, copy it out as f80 and
2539 // use a truncate to move it from fp stack reg to xmm reg.
2540 bool RoundAfterCopy = false;
2541 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2542 isScalarFPTypeInSSEReg(VA.getValVT())) {
2543 if (!Subtarget.hasX87())
2544 report_fatal_error("X87 register return with X87 disabled");
2546 RoundAfterCopy = (CopyVT != VA.getLocVT());
2550 if (VA.needsCustom()) {
2551 assert(VA.getValVT() == MVT::v64i1 &&
2552 "Currently the only custom case is when we split v64i1 to 2 regs");
2554 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2556 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2558 Val = Chain.getValue(0);
2559 InFlag = Chain.getValue(2);
2563 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2564 // This truncation won't change the value.
2565 DAG.getIntPtrConstant(1, dl));
2567 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2568 if (VA.getValVT().isVector() &&
2569 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2570 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2571 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2572 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2574 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2577 InVals.push_back(Val);
2583 //===----------------------------------------------------------------------===//
2584 // C & StdCall & Fast Calling Convention implementation
2585 //===----------------------------------------------------------------------===//
2586 // The StdCall calling convention is the standard for many Windows API
2587 // routines. It differs from the C calling convention only slightly: the
2588 // callee cleans up the stack rather than the caller, and symbols are
2589 // decorated (e.g. _foo@8). It doesn't support any vector arguments.
2590 // For info on fast calling convention see Fast Calling Convention (tail call)
2591 // implementation LowerX86_32FastCCCallTo.
2593 /// CallIsStructReturn - Determines whether a call uses struct return
2594 /// semantics.
2595 enum StructReturnType {
2596 NotStructReturn,
2597 RegStructReturn,
2598 StackStructReturn
2599 };
2600 static StructReturnType
2601 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2603 return NotStructReturn;
2605 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2606 if (!Flags.isSRet())
2607 return NotStructReturn;
2608 if (Flags.isInReg() || IsMCU)
2609 return RegStructReturn;
2610 return StackStructReturn;
2613 /// Determines whether a function uses struct return semantics.
2614 static StructReturnType
2615 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2617 return NotStructReturn;
2619 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2620 if (!Flags.isSRet())
2621 return NotStructReturn;
2622 if (Flags.isInReg() || IsMCU)
2623 return RegStructReturn;
2624 return StackStructReturn;
2627 /// Make a copy of an aggregate at the address specified by "Src" to the
2628 /// address "Dst", with size and alignment information specified by the
2629 /// byval parameter attribute. The copy will be passed as a byval function parameter.
2630 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2631 SDValue Chain, ISD::ArgFlagsTy Flags,
2632 SelectionDAG &DAG, const SDLoc &dl) {
2633 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2635 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2636 /*isVolatile*/false, /*AlwaysInline=*/true,
2637 /*isTailCall*/false,
2638 MachinePointerInfo(), MachinePointerInfo());
2641 /// Return true if the calling convention is one that we can guarantee TCO for.
2642 static bool canGuaranteeTCO(CallingConv::ID CC) {
2643 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2644 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2645 CC == CallingConv::HHVM);
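// Example sketch (hypothetical helper): fastcc qualifies for guaranteed TCO,
// while the plain C convention does not.
static void guaranteedTCOExampleSketch() {
  assert(canGuaranteeTCO(CallingConv::Fast) && "fastcc can guarantee TCO");
  assert(!canGuaranteeTCO(CallingConv::C) && "C CC cannot guarantee TCO");
}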
2648 /// Return true if we might ever do TCO for calls with this calling convention.
2649 static bool mayTailCallThisCC(CallingConv::ID CC) {
2651 // C calling conventions:
2652 case CallingConv::C:
2653 case CallingConv::X86_64_Win64:
2654 case CallingConv::X86_64_SysV:
2655 // Callee pop conventions:
2656 case CallingConv::X86_ThisCall:
2657 case CallingConv::X86_StdCall:
2658 case CallingConv::X86_VectorCall:
2659 case CallingConv::X86_FastCall:
2662 return canGuaranteeTCO(CC);
2666 /// Return true if the function is being made into a tailcall target by
2667 /// changing its ABI.
2668 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2669 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2672 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2674 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2675 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2678 ImmutableCallSite CS(CI);
2679 CallingConv::ID CalleeCC = CS.getCallingConv();
2680 if (!mayTailCallThisCC(CalleeCC))
2687 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2688 const SmallVectorImpl<ISD::InputArg> &Ins,
2689 const SDLoc &dl, SelectionDAG &DAG,
2690 const CCValAssign &VA,
2691 MachineFrameInfo &MFI, unsigned i) const {
2692 // Create the nodes corresponding to a load from this parameter slot.
2693 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2694 bool AlwaysUseMutable = shouldGuaranteeTCO(
2695 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2696 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2698 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2700 // If value is passed by pointer we have address passed instead of the value
2701 // itself. No need to extend if the mask value and location share the same
2702 // bit width.
2703 bool ExtendedInMem =
2704 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2705 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2707 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2708 ValVT = VA.getLocVT();
2710 ValVT = VA.getValVT();
2712 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2713 // taken by a return address.
2715 if (CallConv == CallingConv::X86_INTR) {
2716 // X86 interrupts may take one or two arguments.
2717 // Unlike a regular call, there is no return address on the stack.
2718 // The offset of the last argument therefore needs to be set to -4/-8 bytes,
2719 // while the offset of the first of two arguments is set to 0 bytes.
2720 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2721 if (Subtarget.is64Bit() && Ins.size() == 2) {
2722 // The stack pointer needs to be realigned for 64 bit handlers with error
2723 // code, so the argument offset changes by 8 bytes.
2728 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2729 // changed with more analysis.
2730 // In case of tail call optimization, mark all arguments mutable, since they
2731 // could be overwritten by the lowering of the arguments of a tail call.
2732 if (Flags.isByVal()) {
2733 unsigned Bytes = Flags.getByValSize();
2734 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2735 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2736 // Adjust SP offset of interrupt parameter.
2737 if (CallConv == CallingConv::X86_INTR) {
2738 MFI.setObjectOffset(FI, Offset);
2740 return DAG.getFrameIndex(FI, PtrVT);
2743 // This is an argument in memory. We might be able to perform copy elision.
2744 if (Flags.isCopyElisionCandidate()) {
2745 EVT ArgVT = Ins[i].ArgVT;
2747 if (Ins[i].PartOffset == 0) {
2748 // If this is a one-part value or the first part of a multi-part value,
2749 // create a stack object for the entire argument value type and return a
2750 // load from our portion of it. This assumes that if the first part of an
2751 // argument is in memory, the rest will also be in memory.
2752 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2753 /*Immutable=*/false);
2754 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2756 ValVT, dl, Chain, PartAddr,
2757 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2759 // This is not the first piece of an argument in memory. See if there is
2760 // already a fixed stack object including this offset. If so, assume it
2761 // was created by the PartOffset == 0 branch above and create a load from
2762 // the appropriate offset into it.
2763 int64_t PartBegin = VA.getLocMemOffset();
2764 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2765 int FI = MFI.getObjectIndexBegin();
2766 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2767 int64_t ObjBegin = MFI.getObjectOffset(FI);
2768 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2769 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2772 if (MFI.isFixedObjectIndex(FI)) {
2774 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2775 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2777 ValVT, dl, Chain, Addr,
2778 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2779 Ins[i].PartOffset));
2784 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2785 VA.getLocMemOffset(), isImmutable);
2787 // Set SExt or ZExt flag.
2788 if (VA.getLocInfo() == CCValAssign::ZExt) {
2789 MFI.setObjectZExt(FI, true);
2790 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2791 MFI.setObjectSExt(FI, true);
2794 // Adjust SP offset of interrupt parameter.
2795 if (CallConv == CallingConv::X86_INTR) {
2796 MFI.setObjectOffset(FI, Offset);
2799 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2800 SDValue Val = DAG.getLoad(
2801 ValVT, dl, Chain, FIN,
2802 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2803 return ExtendedInMem
2804 ? (VA.getValVT().isVector()
2805 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2806 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2810 // FIXME: Get this from tablegen.
2811 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2812 const X86Subtarget &Subtarget) {
2813 assert(Subtarget.is64Bit());
2815 if (Subtarget.isCallingConvWin64(CallConv)) {
2816 static const MCPhysReg GPR64ArgRegsWin64[] = {
2817 X86::RCX, X86::RDX, X86::R8, X86::R9
2819 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2822 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2823 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2825 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2828 // FIXME: Get this from tablegen.
2829 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2830 CallingConv::ID CallConv,
2831 const X86Subtarget &Subtarget) {
2832 assert(Subtarget.is64Bit());
2833 if (Subtarget.isCallingConvWin64(CallConv)) {
2834 // The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
2837 // TODO: __vectorcall will change this.
2841 const Function *Fn = MF.getFunction();
2842 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2843 bool isSoftFloat = Subtarget.useSoftFloat();
2844 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2845 "SSE register cannot be used when SSE is disabled!");
2846 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
2851 static const MCPhysReg XMMArgRegs64Bit[] = {
2852 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2853 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2855 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2859 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2860 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2861 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2862 return A.getValNo() < B.getValNo();
2867 SDValue X86TargetLowering::LowerFormalArguments(
2868 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2869 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2870 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2871 MachineFunction &MF = DAG.getMachineFunction();
2872 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2873 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2875 const Function *Fn = MF.getFunction();
2876 if (Fn->hasExternalLinkage() &&
2877 Subtarget.isTargetCygMing() &&
2878 Fn->getName() == "main")
2879 FuncInfo->setForceFramePointer(true);
2881 MachineFrameInfo &MFI = MF.getFrameInfo();
2882 bool Is64Bit = Subtarget.is64Bit();
2883 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2886 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2887 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2889 if (CallConv == CallingConv::X86_INTR) {
2890 bool isLegal = Ins.size() == 1 ||
2891 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2892 (!Is64Bit && Ins[1].VT == MVT::i32)));
if (!isLegal)
  report_fatal_error("X86 interrupts may take one or two arguments");
2897 // Assign locations to all of the incoming arguments.
2898 SmallVector<CCValAssign, 16> ArgLocs;
2899 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2901 // Allocate shadow area for Win64.
if (IsWin64)
  CCInfo.AllocateStack(32, 8);
2905 CCInfo.AnalyzeArguments(Ins, CC_X86);
// In the vectorcall calling convention, a second pass is required for the
// HVA (homogeneous vector aggregate) arguments.
2909 if (CallingConv::X86_VectorCall == CallConv) {
2910 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
// The next loop assumes that the locations are in the same order as the
// Ins array.
2915 assert(isSortedByValueNo(ArgLocs) &&
2916 "Argument Location list must be sorted before lowering");
2919 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2921 assert(InsIndex < Ins.size() && "Invalid Ins index");
2922 CCValAssign &VA = ArgLocs[I];
2924 if (VA.isRegLoc()) {
2925 EVT RegVT = VA.getLocVT();
2926 if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
2929 "Currently the only custom case is when we split v64i1 to 2 regs");
// In the regcall calling convention, v64i1 values that are
// compiled for a 32-bit arch are split up into two registers.
ArgValue =
    getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2936 const TargetRegisterClass *RC;
2937 if (RegVT == MVT::i32)
2938 RC = &X86::GR32RegClass;
2939 else if (Is64Bit && RegVT == MVT::i64)
2940 RC = &X86::GR64RegClass;
2941 else if (RegVT == MVT::f32)
2942 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2943 else if (RegVT == MVT::f64)
2944 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2945 else if (RegVT == MVT::f80)
2946 RC = &X86::RFP80RegClass;
2947 else if (RegVT == MVT::f128)
2948 RC = &X86::FR128RegClass;
2949 else if (RegVT.is512BitVector())
2950 RC = &X86::VR512RegClass;
2951 else if (RegVT.is256BitVector())
2952 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2953 else if (RegVT.is128BitVector())
2954 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2955 else if (RegVT == MVT::x86mmx)
2956 RC = &X86::VR64RegClass;
2957 else if (RegVT == MVT::v1i1)
2958 RC = &X86::VK1RegClass;
2959 else if (RegVT == MVT::v8i1)
2960 RC = &X86::VK8RegClass;
2961 else if (RegVT == MVT::v16i1)
2962 RC = &X86::VK16RegClass;
2963 else if (RegVT == MVT::v32i1)
2964 RC = &X86::VK32RegClass;
2965 else if (RegVT == MVT::v64i1)
2966 RC = &X86::VK64RegClass;
2968 llvm_unreachable("Unknown argument type!");
2970 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2971 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
2977 if (VA.getLocInfo() == CCValAssign::SExt)
2978 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2979 DAG.getValueType(VA.getValVT()));
2980 else if (VA.getLocInfo() == CCValAssign::ZExt)
2981 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2982 DAG.getValueType(VA.getValVT()));
2983 else if (VA.getLocInfo() == CCValAssign::BCvt)
2984 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2986 if (VA.isExtInLoc()) {
2987 // Handle MMX values passed in XMM regs.
2988 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2989 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2990 else if (VA.getValVT().isVector() &&
2991 VA.getValVT().getScalarType() == MVT::i1 &&
2992 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2993 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2994 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2995 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2997 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3000 assert(VA.isMemLoc());
ArgValue =
    LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3005 // If value is passed via pointer - do a load.
3006 if (VA.getLocInfo() == CCValAssign::Indirect)
ArgValue =
    DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3010 InVals.push_back(ArgValue);
3013 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
// The Swift calling convention does not require that we copy the sret
// argument into %rax/%eax for the return. We don't set SRetReturnReg for
// Swift.
3016 if (CallConv == CallingConv::Swift)
3019 // All x86 ABIs require that for returning structs by value we copy the
3020 // sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
3023 if (Ins[I].Flags.isSRet()) {
3024 unsigned Reg = FuncInfo->getSRetReturnReg();
3026 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3027 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3028 FuncInfo->setSRetReturnReg(Reg);
3030 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3031 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3036 unsigned StackSize = CCInfo.getNextStackOffset();
3037 // Align stack specially for tail calls.
3038 if (shouldGuaranteeTCO(CallConv,
3039 MF.getTarget().Options.GuaranteedTailCallOpt))
3040 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes a variable number of arguments, make a frame index for
3043 // the start of the first vararg value... for expansion of llvm.va_start. We
3044 // can skip this if there are no va_start calls.
3045 if (MFI.hasVAStart() &&
3046 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3047 CallConv != CallingConv::X86_ThisCall))) {
3048 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3051 // Figure out if XMM registers are in use.
3052 assert(!(Subtarget.useSoftFloat() &&
3053 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3054 "SSE register cannot be used when SSE is disabled!");
3056 // 64-bit calling conventions support varargs and register parameters, so we
3057 // have to do extra work to spill them in the prologue.
3058 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3059 // Find the first unallocated argument registers.
3060 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3061 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3062 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3063 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3064 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3065 "SSE register cannot be used when SSE is disabled!");
3067 // Gather all the live in physical registers.
3068 SmallVector<SDValue, 6> LiveGPRs;
3069 SmallVector<SDValue, 8> LiveXMMRegs;
3071 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3072 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
LiveGPRs.push_back(
    DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3076 if (!ArgXMMs.empty()) {
3077 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3078 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3079 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3080 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3081 LiveXMMRegs.push_back(
3082 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3087 // Get to the caller-allocated home save location. Add 8 to account
3088 // for the return address.
3089 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3090 FuncInfo->setRegSaveFrameIndex(
3091 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3092 // Fixup to set vararg frame on shadow area (4 x i64).
3094 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3096 // For X86-64, if there are vararg parameters that are passed via
3097 // registers, then we must store them to their spots on the stack so
3098 // they may be loaded by dereferencing the result of va_next.
3099 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3100 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3101 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3102 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
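// Illustrative layout (non-Win64, SSE enabled): the save area is
// 6*8 + 8*16 = 176 bytes. With NumIntRegs == 2 and NumXMMRegs == 1, va_arg
// starts reading GPRs at offset 2*8 = 16 and XMMs at offset 48 + 1*16 = 64.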
3105 // Store the integer parameter registers.
3106 SmallVector<SDValue, 8> MemOps;
3107 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3108 getPointerTy(DAG.getDataLayout()));
3109 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3110 for (SDValue Val : LiveGPRs) {
3111 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3112 RSFIN, DAG.getIntPtrConstant(Offset, dl));
SDValue Store =
    DAG.getStore(Val.getValue(1), dl, Val, FIN,
3115 MachinePointerInfo::getFixedStack(
3116 DAG.getMachineFunction(),
3117 FuncInfo->getRegSaveFrameIndex(), Offset));
3118 MemOps.push_back(Store);
3122 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3123 // Now store the XMM (fp + vector) parameter registers.
3124 SmallVector<SDValue, 12> SaveXMMOps;
3125 SaveXMMOps.push_back(Chain);
3126 SaveXMMOps.push_back(ALVal);
3127 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3128 FuncInfo->getRegSaveFrameIndex(), dl));
3129 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3130 FuncInfo->getVarArgsFPOffset(), dl));
3131 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3133 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3134 MVT::Other, SaveXMMOps));
3137 if (!MemOps.empty())
3138 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3141 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3142 // Find the largest legal vector type.
3143 MVT VecVT = MVT::Other;
3144 // FIXME: Only some x86_32 calling conventions support AVX512.
3145 if (Subtarget.hasAVX512() &&
3146 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3147 CallConv == CallingConv::Intel_OCL_BI)))
3148 VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
  VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
  VecVT = MVT::v4f32;
3154 // We forward some GPRs and some vector types.
3155 SmallVector<MVT, 2> RegParmTypes;
3156 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3157 RegParmTypes.push_back(IntVT);
3158 if (VecVT != MVT::Other)
3159 RegParmTypes.push_back(VecVT);
3161 // Compute the set of forwarded registers. The rest are scratch.
3162 SmallVectorImpl<ForwardedRegister> &Forwards =
3163 FuncInfo->getForwardedMustTailRegParms();
3164 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3166 // Conservatively forward AL on x86_64, since it might be used for varargs.
3167 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3168 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3169 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3172 // Copy all forwards from physical to virtual registers.
3173 for (ForwardedRegister &F : Forwards) {
3174 // FIXME: Can we use a less constrained schedule?
3175 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3176 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3177 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3181 // Some CCs need callee pop.
3182 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3183 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3184 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3185 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
3188 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3190 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3191 // If this is an sret function, the return should pop the hidden pointer.
3192 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3193 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3194 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3195 FuncInfo->setBytesToPopOnReturn(4);
3199 // RegSaveFrameIndex is X86-64 only.
3200 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3201 if (CallConv == CallingConv::X86_FastCall ||
3202 CallConv == CallingConv::X86_ThisCall)
3203 // fastcc functions can't have varargs.
3204 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3207 FuncInfo->setArgumentStackSize(StackSize);
3209 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3210 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3211 if (Personality == EHPersonality::CoreCLR) {
3213 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3214 // that we'd prefer this slot be allocated towards the bottom of the frame
3215 // (i.e. near the stack pointer after allocating the frame). Every
3216 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3217 // offset from the bottom of this and each funclet's frame must be the
3218 // same, so the size of funclets' (mostly empty) frames is dictated by
3219 // how far this slot is from the bottom (since they allocate just enough
3220 // space to accommodate holding this slot at the correct offset).
3221 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3222 EHInfo->PSPSymFrameIdx = PSPSymFI;
3226 if (CallConv == CallingConv::X86_RegCall ||
3227 Fn->hasFnAttribute("no_caller_saved_registers")) {
3228 const MachineRegisterInfo &MRI = MF.getRegInfo();
3229 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3230 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3236 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3237 SDValue Arg, const SDLoc &dl,
3239 const CCValAssign &VA,
3240 ISD::ArgFlagsTy Flags) const {
3241 unsigned LocMemOffset = VA.getLocMemOffset();
3242 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3243 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3245 if (Flags.isByVal())
3246 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3248 return DAG.getStore(
3249 Chain, dl, Arg, PtrOff,
3250 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
/// Emit a load of the return address if tail call
/// optimization is performed and it is required.
3255 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3256 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3257 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3258 // Adjust the Return address stack slot.
3259 EVT VT = getPointerTy(DAG.getDataLayout());
3260 OutRetAddr = getReturnAddressFrameIndex(DAG);
3262 // Load the "old" Return address.
3263 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3264 return SDValue(OutRetAddr.getNode(), 1);
3267 /// Emit a store of the return address if tail call
3268 /// optimization is performed and it is required (FPDiff!=0).
3269 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3270 SDValue Chain, SDValue RetAddrFrIdx,
3271 EVT PtrVT, unsigned SlotSize,
3272 int FPDiff, const SDLoc &dl) {
3273 // Store the return address to the appropriate stack slot.
3274 if (!FPDiff) return Chain;
3275 // Calculate the new stack slot for the return address.
3276 int NewReturnAddrFI =
3277 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3279 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3280 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3281 MachinePointerInfo::getFixedStack(
3282 DAG.getMachineFunction(), NewReturnAddrFI));
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of specified width.
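/// For example, for VT == v4f32 the mask is <4, 1, 2, 3>: element 0 is
/// taken from V2 and elements 1-3 from V1, matching the movss semantics.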
3288 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3290 unsigned NumElems = VT.getVectorNumElements();
3291 SmallVector<int, 8> Mask;
3292 Mask.push_back(NumElems);
3293 for (unsigned i = 1; i != NumElems; ++i)
3295 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3300 SmallVectorImpl<SDValue> &InVals) const {
3301 SelectionDAG &DAG = CLI.DAG;
3303 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3304 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3305 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3306 SDValue Chain = CLI.Chain;
3307 SDValue Callee = CLI.Callee;
3308 CallingConv::ID CallConv = CLI.CallConv;
3309 bool &isTailCall = CLI.IsTailCall;
3310 bool isVarArg = CLI.IsVarArg;
3312 MachineFunction &MF = DAG.getMachineFunction();
3313 bool Is64Bit = Subtarget.is64Bit();
3314 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3315 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3316 bool IsSibcall = false;
3317 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3318 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3319 const CallInst *CI =
3320 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3321 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3322 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3323 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3325 if (CallConv == CallingConv::X86_INTR)
3326 report_fatal_error("X86 interrupts may not be called directly");
if (Attr.getValueAsString() == "true")
  isTailCall = false;
3331 if (Subtarget.isPICStyleGOT() &&
3332 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3333 // If we are using a GOT, disable tail calls to external symbols with
3334 // default visibility. Tail calling such a symbol requires using a GOT
// relocation, which forces early binding of the symbol. This breaks code
// that requires lazy function symbol resolution. Using musttail or
// GuaranteedTailCallOpt will override this.
3338 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3339 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3340 G->getGlobal()->hasDefaultVisibility()))
bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
// around.
isTailCall = true;
3350 } else if (isTailCall) {
3351 // Check if it's really possible to do a tail call.
3352 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3353 isVarArg, SR != NotStructReturn,
3354 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3355 Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
// ABI changes.
if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
  IsSibcall = true;
3366 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3367 "Var args not supported with calling convention fastcc, ghc or hipe");
3369 // Analyze operands of the call, assigning locations to each operand.
3370 SmallVector<CCValAssign, 16> ArgLocs;
3371 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3373 // Allocate shadow area for Win64.
if (IsWin64)
  CCInfo.AllocateStack(32, 8);
3377 CCInfo.AnalyzeArguments(Outs, CC_X86);
// In the vectorcall calling convention, a second pass is required for the
// HVA (homogeneous vector aggregate) arguments.
3381 if (CallingConv::X86_VectorCall == CallConv) {
3382 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3385 // Get a count of how many bytes are to be pushed on the stack.
3386 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
  // This is a sibcall. The memory operands are already available in the
  // caller's own incoming argument space (set up by the caller's caller),
  // so no new stack space is needed.
  NumBytes = 0;
3391 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3392 canGuaranteeTCO(CallConv))
3393 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3396 if (isTailCall && !IsSibcall && !IsMustTail) {
3397 // Lower arguments at fp - stackoffset + fpdiff.
3398 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3400 FPDiff = NumBytesCallerPushed - NumBytes;
// Record the delta by which the return address stack slot moves, but only
// if this delta is larger than any previously recorded one (FPDiff is
// non-positive, so a smaller value means a larger move).
if (FPDiff < X86Info->getTCReturnAddrDelta())
3405 X86Info->setTCReturnAddrDelta(FPDiff);
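// Sketch with illustrative numbers: if the caller pops 16 bytes on return
// but this call pushes 24 bytes of arguments, FPDiff = 16 - 24 = -8 and
// the return address slot moves 8 bytes further down the stack.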
3408 unsigned NumBytesToPush = NumBytes;
3409 unsigned NumBytesToPop = NumBytes;
// If we have an inalloca argument, all stack space has already been allocated
// for us and will be right at the top of the stack. We don't support multiple
// arguments passed in memory when using inalloca.
3414 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3416 if (!ArgLocs.back().isMemLoc())
3417 report_fatal_error("cannot use inalloca attribute on a register "
3419 if (ArgLocs.back().getLocMemOffset() != 0)
3420 report_fatal_error("any parameter with the inalloca attribute must be "
3421 "the only memory argument");
3425 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3426 NumBytes - NumBytesToPush, dl);
3428 SDValue RetAddrFrIdx;
3429 // Load return address for tail calls.
3430 if (isTailCall && FPDiff)
3431 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3432 Is64Bit, FPDiff, dl);
3434 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3435 SmallVector<SDValue, 8> MemOpChains;
// The next loop assumes that the locations are in the same order as the
// Outs array.
3440 assert(isSortedByValueNo(ArgLocs) &&
3441 "Argument Location list must be sorted before lowering");
// Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
3445 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3446 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3448 assert(OutIndex < Outs.size() && "Invalid Out index");
3449 // Skip inalloca arguments, they have already been written.
3450 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3451 if (Flags.isInAlloca())
3454 CCValAssign &VA = ArgLocs[I];
3455 EVT RegVT = VA.getLocVT();
3456 SDValue Arg = OutVals[OutIndex];
3457 bool isByVal = Flags.isByVal();
3459 // Promote the value if needed.
3460 switch (VA.getLocInfo()) {
3461 default: llvm_unreachable("Unknown loc info!");
3462 case CCValAssign::Full: break;
3463 case CCValAssign::SExt:
3464 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3466 case CCValAssign::ZExt:
3467 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3469 case CCValAssign::AExt:
3470 if (Arg.getValueType().isVector() &&
3471 Arg.getValueType().getVectorElementType() == MVT::i1)
3472 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3473 else if (RegVT.is128BitVector()) {
3474 // Special case: passing MMX values in XMM registers.
3475 Arg = DAG.getBitcast(MVT::i64, Arg);
3476 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3477 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3479 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3481 case CCValAssign::BCvt:
3482 Arg = DAG.getBitcast(RegVT, Arg);
3484 case CCValAssign::Indirect: {
3485 // Store the argument.
3486 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3487 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3488 Chain = DAG.getStore(
3489 Chain, dl, Arg, SpillSlot,
3490 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3496 if (VA.needsCustom()) {
3497 assert(VA.getValVT() == MVT::v64i1 &&
3498 "Currently the only custom case is when we split v64i1 to 2 regs");
3499 // Split v64i1 value into two registers
3500 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3502 } else if (VA.isRegLoc()) {
3503 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3504 if (isVarArg && IsWin64) {
// The Win64 ABI requires an argument XMM reg to be copied to the
// corresponding shadow reg if the callee is a varargs function.
3507 unsigned ShadowReg = 0;
3508 switch (VA.getLocReg()) {
3509 case X86::XMM0: ShadowReg = X86::RCX; break;
3510 case X86::XMM1: ShadowReg = X86::RDX; break;
3511 case X86::XMM2: ShadowReg = X86::R8; break;
3512 case X86::XMM3: ShadowReg = X86::R9; break;
3515 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3517 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3518 assert(VA.isMemLoc());
3519 if (!StackPtr.getNode())
3520 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3521 getPointerTy(DAG.getDataLayout()));
3522 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3523 dl, DAG, VA, Flags));
3527 if (!MemOpChains.empty())
3528 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3530 if (Subtarget.isPICStyleGOT()) {
// ELF / PIC requires the GOT to be in the EBX register before function
// calls made via the PLT.
3534 RegsToPass.push_back(std::make_pair(
3535 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3536 getPointerTy(DAG.getDataLayout()))));
3538 // If we are tail calling and generating PIC/GOT style code load the
3539 // address of the callee into ECX. The value in ecx is used as target of
3540 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3541 // for tail calls on PIC/GOT architectures. Normally we would just put the
3542 // address of GOT into ebx and then call target@PLT. But for tail calls
// ebx would be restored (since ebx is callee saved) before jumping to the
// callee.
3546 // Note: The actual moving to ECX is done further down.
3547 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3548 if (G && !G->getGlobal()->hasLocalLinkage() &&
3549 G->getGlobal()->hasDefaultVisibility())
3550 Callee = LowerGlobalAddress(Callee, DAG);
3551 else if (isa<ExternalSymbolSDNode>(Callee))
3552 Callee = LowerExternalSymbol(Callee, DAG);
3556 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3557 // From AMD64 ABI document:
3558 // For calls that may call functions that use varargs or stdargs
3559 // (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as a hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used, and is in the range 0 - 8 inclusive.
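// For example, for a call like printf("%f\n", x) with x passed in XMM0,
// any %al value >= 1 satisfies the ABI; the exact count below is simply
// the tightest legal bound.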
3565 // Count the number of XMM registers allocated.
3566 static const MCPhysReg XMMArgRegs[] = {
3567 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3568 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3570 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3571 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3572 && "SSE registers cannot be used when SSE is disabled");
3574 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3575 DAG.getConstant(NumXMMRegs, dl,
3579 if (isVarArg && IsMustTail) {
3580 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3581 for (const auto &F : Forwards) {
3582 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3583 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3587 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3588 // don't need this because the eligibility check rejects calls that require
3589 // shuffling arguments passed in memory.
3590 if (!IsSibcall && isTailCall) {
3591 // Force all the incoming stack arguments to be loaded from the stack
3592 // before any new outgoing arguments are stored to the stack, because the
3593 // outgoing stack slots may alias the incoming argument stack slots, and
3594 // the alias isn't otherwise explicit. This is slightly more conservative
3595 // than necessary, because it means that each store effectively depends
3596 // on every argument instead of just those arguments it would clobber.
3597 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3599 SmallVector<SDValue, 8> MemOpChains2;
3602 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3604 CCValAssign &VA = ArgLocs[I];
3606 if (VA.isRegLoc()) {
3607 if (VA.needsCustom()) {
3608 assert((CallConv == CallingConv::X86_RegCall) &&
3609 "Expecting custom case only in regcall calling convention");
// This means that we are in the special case where one argument was
// passed through two register locations, so skip the next location.
3618 assert(VA.isMemLoc());
3619 SDValue Arg = OutVals[OutsIndex];
3620 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3621 // Skip inalloca arguments. They don't require any work.
3622 if (Flags.isInAlloca())
3624 // Create frame index.
3625 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3626 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
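// OpSize is the location size rounded up to whole bytes; Offset folds in
// FPDiff so the slot lands where the argument must live after the return
// address moves.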
3627 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3628 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3630 if (Flags.isByVal()) {
// Copy relative to the frame pointer.
3632 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3633 if (!StackPtr.getNode())
3634 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3635 getPointerTy(DAG.getDataLayout()));
3636 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3639 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
// Store relative to the frame pointer.
3644 MemOpChains2.push_back(DAG.getStore(
3645 ArgChain, dl, Arg, FIN,
3646 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3650 if (!MemOpChains2.empty())
3651 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3653 // Store the return address to the appropriate stack slot.
3654 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3655 getPointerTy(DAG.getDataLayout()),
3656 RegInfo->getSlotSize(), FPDiff, dl);
3659 // Build a sequence of copy-to-reg nodes chained together with token chain
3660 // and flag operands which copy the outgoing args into registers.
3662 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3663 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3664 RegsToPass[i].second, InFlag);
3665 InFlag = Chain.getValue(1);
3668 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3669 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3670 // In the 64-bit large code model, we have to make all calls
3671 // through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
3674 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
// If the callee is a GlobalAddress node (quite common, every direct call
// is), turn it into a TargetGlobalAddress node so that legalize doesn't
// hack it.
3678 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
// We should use an extra load for direct calls to dllimported functions in
// 32-bit mode.
3682 const GlobalValue *GV = G->getGlobal();
3683 if (!GV->hasDLLImportStorageClass()) {
3684 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3686 Callee = DAG.getTargetGlobalAddress(
3687 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3689 if (OpFlags == X86II::MO_GOTPCREL) {
3691 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3692 getPointerTy(DAG.getDataLayout()), Callee);
3693 // Add extra indirection
3694 Callee = DAG.getLoad(
3695 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3696 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3699 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3700 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3701 unsigned char OpFlags =
3702 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3704 Callee = DAG.getTargetExternalSymbol(
3705 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3706 } else if (Subtarget.isTarget64BitILP32() &&
3707 Callee->getValueType(0) == MVT::i32) {
// Zero-extend the 32-bit Callee address into a 64-bit one according to the
// x32 ABI.
3709 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3712 // Returns a chain & a flag for retval copy to use.
3713 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3714 SmallVector<SDValue, 8> Ops;
3716 if (!IsSibcall && isTailCall) {
3717 Chain = DAG.getCALLSEQ_END(Chain,
3718 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3719 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3720 InFlag = Chain.getValue(1);
3723 Ops.push_back(Chain);
3724 Ops.push_back(Callee);
3727 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
// into the call.
3731 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3732 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3733 RegsToPass[i].second.getValueType()));
3735 // Add a register mask operand representing the call-preserved registers.
// If HasNCSR is asserted (i.e. the no_caller_saved_registers attribute is
// present), we use the X86_INTR calling convention, because it has the same
// CSR mask (the same set of preserved registers).
3739 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3740 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3741 assert(Mask && "Missing call preserved mask for calling convention");
3743 // If this is an invoke in a 32-bit function using a funclet-based
3744 // personality, assume the function clobbers all registers. If an exception
3745 // is thrown, the runtime will not restore CSRs.
3746 // FIXME: Model this more precisely so that we can register allocate across
3747 // the normal edge and spill and fill across the exceptional edge.
3748 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3749 const Function *CallerFn = MF.getFunction();
3750 EHPersonality Pers =
3751 CallerFn->hasPersonalityFn()
3752 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3753 : EHPersonality::Unknown;
3754 if (isFuncletEHPersonality(Pers))
3755 Mask = RegInfo->getNoPreservedMask();
3758 // Define a new register mask from the existing mask.
3759 uint32_t *RegMask = nullptr;
3761 // In some calling conventions we need to remove the used physical registers
3762 // from the reg mask.
3763 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3764 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3766 // Allocate a new Reg Mask and copy Mask.
3767 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3768 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3769 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
// Make sure all sub registers of the argument registers are reset
// in the RegMask.
3774 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3775 SubRegs.isValid(); ++SubRegs)
3776 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
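// E.g. if EAX carries an argument, the mask bits for EAX itself and for
// its sub-registers AX, AH and AL are all cleared, marking them clobbered.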
3778 // Create the RegMask Operand according to our updated mask.
3779 Ops.push_back(DAG.getRegisterMask(RegMask));
3781 // Create the RegMask Operand according to the static mask.
3782 Ops.push_back(DAG.getRegisterMask(Mask));
3785 if (InFlag.getNode())
3786 Ops.push_back(InFlag);
3790 //// If this is the first return lowered for this function, add the regs
3791 //// to the liveout set for the function.
3792 // This isn't right, although it's probably harmless on x86; liveouts
3793 // should be computed from returns not tail calls. Consider a void
3794 // function making a tail call to a function returning int.
3795 MF.getFrameInfo().setHasTailCall();
3796 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3799 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3800 InFlag = Chain.getValue(1);
3802 // Create the CALLSEQ_END node.
3803 unsigned NumBytesForCalleeToPop;
3804 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3805 DAG.getTarget().Options.GuaranteedTailCallOpt))
3806 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3807 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3808 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3809 SR == StackStructReturn)
3810 // If this is a call to a struct-return function, the callee
3811 // pops the hidden struct pointer, so we have to push it back.
3812 // This is common for Darwin/X86, Linux & Mingw32 targets.
3813 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3814 NumBytesForCalleeToPop = 4;
3816 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3818 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
// No need to reset the stack after the call if the call doesn't return. To
// keep the MI verifier happy, we'll pretend the callee does it for us.
3821 NumBytesForCalleeToPop = NumBytes;
3824 // Returns a flag for retval copy to use.
3826 Chain = DAG.getCALLSEQ_END(Chain,
3827 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3828 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3831 InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
3836 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3840 //===----------------------------------------------------------------------===//
3841 // Fast Calling Convention (tail call) implementation
3842 //===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization
// is performed provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
3850 // On X86_64 architecture with GOT-style position independent code only local
3851 // (within module) calls are supported at the moment.
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's
// dyld, for example.)
// If a tail-called function has more arguments than the caller, the caller
// needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after
// the original RETADDR, but before the saved frame pointer or the spilled
// registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
/// Align the stack size to, e.g., 16n + 12 for a 16-byte alignment with
/// 4-byte slots, so that the stack stays aligned once the return address
/// slot is accounted for.
unsigned
3874 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3875 SelectionDAG& DAG) const {
3876 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3877 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3878 unsigned StackAlignment = TFI.getStackAlignment();
3879 uint64_t AlignMask = StackAlignment - 1;
3880 int64_t Offset = StackSize;
3881 unsigned SlotSize = RegInfo->getSlotSize();
3882 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3883 // Number smaller than 12 so just add the difference.
3884 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  // Mask out the lower bits and add the stack alignment once, plus the
  // (StackAlignment - SlotSize) bytes.
  Offset = ((~AlignMask) & Offset) + StackAlignment +
           (StackAlignment - SlotSize);
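// Worked example (illustrative): with StackAlignment = 16 and SlotSize = 4
// the target shape is 16n + 12. StackSize = 20 takes the first branch:
// 20 + (12 - 4) = 28 = 16 + 12. StackSize = 30 takes the second:
// (30 & ~15) + 16 + 12 = 16 + 28 = 44 = 32 + 12.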
/// Return true if the given stack call argument is already available in the
/// same (relative) position in the caller's incoming argument stack.
static
3896 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3897 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3898 const X86InstrInfo *TII, const CCValAssign &VA) {
3899 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3902 // Look through nodes that don't alter the bits of the incoming value.
3903 unsigned Op = Arg.getOpcode();
3904 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3905 Arg = Arg.getOperand(0);
3908 if (Op == ISD::TRUNCATE) {
3909 const SDValue &TruncInput = Arg.getOperand(0);
3910 if (TruncInput.getOpcode() == ISD::AssertZext &&
3911 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3912 Arg.getValueType()) {
3913 Arg = TruncInput.getOperand(0);
3921 if (Arg.getOpcode() == ISD::CopyFromReg) {
3922 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3923 if (!TargetRegisterInfo::isVirtualRegister(VR))
3925 MachineInstr *Def = MRI->getVRegDef(VR);
3928 if (!Flags.isByVal()) {
3929 if (!TII->isLoadFromStackSlot(*Def, FI))
3932 unsigned Opcode = Def->getOpcode();
3933 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3934 Opcode == X86::LEA64_32r) &&
3935 Def->getOperand(1).isFI()) {
3936 FI = Def->getOperand(1).getIndex();
3937 Bytes = Flags.getByValSize();
3941 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3942 if (Flags.isByVal())
3943 // ByVal argument is passed in as a pointer but it's now being
3944 // dereferenced. e.g.
3945 // define @foo(%struct.X* %A) {
3946 // tail call @bar(%struct.X* byval %A)
3949 SDValue Ptr = Ld->getBasePtr();
3950 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3953 FI = FINode->getIndex();
3954 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3955 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3956 FI = FINode->getIndex();
3957 Bytes = Flags.getByValSize();
3961 assert(FI != INT_MAX);
3962 if (!MFI.isFixedObjectIndex(FI))
3965 if (Offset != MFI.getObjectOffset(FI))
3968 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3969 // If the argument location is wider than the argument type, check that any
3970 // extension flags match.
3971 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3972 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3977 return Bytes == MFI.getObjectSize(FI);
3980 /// Check whether the call is eligible for tail call optimization. Targets
3981 /// that want to do tail call optimization should implement this function.
3982 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3983 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3984 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3985 const SmallVectorImpl<ISD::OutputArg> &Outs,
3986 const SmallVectorImpl<SDValue> &OutVals,
3987 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3988 if (!mayTailCallThisCC(CalleeCC))
3991 // If -tailcallopt is specified, make fastcc functions tail-callable.
3992 MachineFunction &MF = DAG.getMachineFunction();
3993 const Function *CallerF = MF.getFunction();
3995 // If the function return type is x86_fp80 and the callee return type is not,
3996 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3997 // perform a tailcall optimization here.
3998 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4001 CallingConv::ID CallerCC = CallerF->getCallingConv();
4002 bool CCMatch = CallerCC == CalleeCC;
4003 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4004 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4006 // Win64 functions have extra shadow space for argument homing. Don't do the
// sibcall if the caller and callee have mismatched expectations for this
// space.
4009 if (IsCalleeWin64 != IsCallerWin64)
4012 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4013 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4018 // Look for obvious safe cases to perform tail call optimization that do not
4019 // require ABI changes. This is what gcc calls sibcall.
4021 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4022 // emit a special epilogue.
4023 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4024 if (RegInfo->needsStackRealignment(MF))
4027 // Also avoid sibcall optimization if either caller or callee uses struct
4028 // return semantics.
4029 if (isCalleeStructRet || isCallerStructRet)
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
4034 LLVMContext &C = *DAG.getContext();
4035 if (isVarArg && !Outs.empty()) {
4036 // Optimizing for varargs on Win64 is unlikely to be safe without
4037 // additional testing.
4038 if (IsCalleeWin64 || IsCallerWin64)
4041 SmallVector<CCValAssign, 16> ArgLocs;
4042 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4044 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4045 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4046 if (!ArgLocs[i].isRegLoc())
4050 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4051 // stack. Therefore, if it's not used by the call it is not safe to optimize
4052 // this into a sibcall.
4053 bool Unused = false;
4054 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4061 SmallVector<CCValAssign, 16> RVLocs;
4062 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4063 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4064 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4065 CCValAssign &VA = RVLocs[i];
4066 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4071 // Check that the call results are passed in the same way.
4072 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4073 RetCC_X86, RetCC_X86))
4075 // The callee has to preserve all registers the caller needs to preserve.
4076 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4077 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4079 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4080 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4084 unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
4088 if (!Outs.empty()) {
4089 // Check if stack adjustment is needed. For now, do not do this if any
4090 // argument is passed on the stack.
4091 SmallVector<CCValAssign, 16> ArgLocs;
4092 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4094 // Allocate shadow area for Win64
4096 CCInfo.AllocateStack(32, 8);
4098 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4099 StackArgsSize = CCInfo.getNextStackOffset();
4101 if (CCInfo.getNextStackOffset()) {
// Check if the arguments are already laid out in the right positions,
// matching the caller's fixed stack objects.
4104 MachineFrameInfo &MFI = MF.getFrameInfo();
4105 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4106 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4107 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4108 CCValAssign &VA = ArgLocs[i];
4109 SDValue Arg = OutVals[i];
4110 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4111 if (VA.getLocInfo() == CCValAssign::Indirect)
4113 if (!VA.isRegLoc()) {
4114 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4121 bool PositionIndependent = isPositionIndependent();
4122 // If the tailcall address may be in a register, then make sure it's
4123 // possible to register allocate for it. In 32-bit, the call address can
4124 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4125 // callee-saved registers are restored. These happen to be the same
4126 // registers used to pass 'inreg' arguments so watch out for those.
4127 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4128 !isa<ExternalSymbolSDNode>(Callee)) ||
4129 PositionIndependent)) {
4130 unsigned NumInRegs = 0;
// In PIC we need an extra register to formulate the address computation
// for the callee.
4133 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4135 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4136 CCValAssign &VA = ArgLocs[i];
4139 unsigned Reg = VA.getLocReg();
4142 case X86::EAX: case X86::EDX: case X86::ECX:
4143 if (++NumInRegs == MaxInRegs)
4150 const MachineRegisterInfo &MRI = MF.getRegInfo();
4151 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4155 bool CalleeWillPop =
4156 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4157 MF.getTarget().Options.GuaranteedTailCallOpt);
4159 if (unsigned BytesToPop =
4160 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4161 // If we have bytes to pop, the callee must pop them.
4162 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4163 if (!CalleePopMatches)
4165 } else if (CalleeWillPop && StackArgsSize > 0) {
4166 // If we don't have bytes to pop, make sure the callee doesn't pop any.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4175 const TargetLibraryInfo *libInfo) const {
4176 return X86::createFastISel(funcInfo, libInfo);
4179 //===----------------------------------------------------------------------===//
4180 // Other Lowering Hooks
4181 //===----------------------------------------------------------------------===//
4183 static bool MayFoldLoad(SDValue Op) {
4184 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4187 static bool MayFoldIntoStore(SDValue Op) {
4188 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4191 static bool MayFoldIntoZeroExtend(SDValue Op) {
4192 if (Op.hasOneUse()) {
4193 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4194 return (ISD::ZERO_EXTEND == Opcode);
static bool isTargetShuffle(unsigned Opcode) {
  switch (Opcode) {
4201 default: return false;
4202 case X86ISD::BLENDI:
4203 case X86ISD::PSHUFB:
4204 case X86ISD::PSHUFD:
4205 case X86ISD::PSHUFHW:
4206 case X86ISD::PSHUFLW:
4208 case X86ISD::INSERTPS:
4209 case X86ISD::PALIGNR:
4210 case X86ISD::VSHLDQ:
4211 case X86ISD::VSRLDQ:
4212 case X86ISD::MOVLHPS:
4213 case X86ISD::MOVLHPD:
4214 case X86ISD::MOVHLPS:
4215 case X86ISD::MOVLPS:
4216 case X86ISD::MOVLPD:
4217 case X86ISD::MOVSHDUP:
4218 case X86ISD::MOVSLDUP:
4219 case X86ISD::MOVDDUP:
4222 case X86ISD::UNPCKL:
4223 case X86ISD::UNPCKH:
4224 case X86ISD::VBROADCAST:
4225 case X86ISD::VPERMILPI:
4226 case X86ISD::VPERMILPV:
4227 case X86ISD::VPERM2X128:
4228 case X86ISD::VPERMIL2:
4229 case X86ISD::VPERMI:
4230 case X86ISD::VPPERM:
4231 case X86ISD::VPERMV:
4232 case X86ISD::VPERMV3:
4233 case X86ISD::VPERMIV3:
4234 case X86ISD::VZEXT_MOVL:
static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
4241 default: return false;
4243 case X86ISD::PSHUFB:
4244 case X86ISD::VPERMILPV:
4245 case X86ISD::VPERMIL2:
4246 case X86ISD::VPPERM:
4247 case X86ISD::VPERMV:
4248 case X86ISD::VPERMV3:
4249 case X86ISD::VPERMIV3:
4251 // 'Faux' Target Shuffles.
4258 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4259 MachineFunction &MF = DAG.getMachineFunction();
4260 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4261 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4262 int ReturnAddrIndex = FuncInfo->getRAIndex();
4264 if (ReturnAddrIndex == 0) {
4265 // Set up a frame object for the return address.
4266 unsigned SlotSize = RegInfo->getSlotSize();
4267 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4270 FuncInfo->setRAIndex(ReturnAddrIndex);
4273 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4276 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4277 bool hasSymbolicDisplacement) {
// The offset should fit into a 32-bit immediate field.
4279 if (!isInt<32>(Offset))
// If we don't have a symbolic displacement, we don't have any extra
// restrictions.
4284 if (!hasSymbolicDisplacement)
4287 // FIXME: Some tweaks might be needed for medium code model.
4288 if (M != CodeModel::Small && M != CodeModel::Kernel)
// For the small code model, we assume that the latest object is 16MB below
// the end of the 31-bit boundary. We may also accept pretty large negative
// constants, knowing that all objects are in the positive half of the
// address space.
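// (E.g. with a symbolic base, an added offset below 16MB is accepted,
// while 16MB or more could push the final address past the 2^31 boundary.)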
4294 if (M == CodeModel::Small && Offset < 16*1024*1024)
// For the kernel code model, we know that all objects reside in the
// negative half of the 32-bit address space. We must not accept negative
// offsets, since they may push an address out of range, but we may accept
// pretty large positive ones.
4300 if (M == CodeModel::Kernel && Offset >= 0)
4306 /// Determines whether the callee is required to pop its own arguments.
4307 /// Callee pop is necessary to support tail calls.
4308 bool X86::isCalleePop(CallingConv::ID CallingConv,
4309 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4310 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4311 // can guarantee TCO.
4312 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4315 switch (CallingConv) {
4318 case CallingConv::X86_StdCall:
4319 case CallingConv::X86_FastCall:
4320 case CallingConv::X86_ThisCall:
4321 case CallingConv::X86_VectorCall:
4326 /// \brief Return true if the condition is an unsigned comparison operation.
4327 static bool isX86CCUnsigned(unsigned X86CC) {
4330 llvm_unreachable("Invalid integer condition!");
4346 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4347 switch (SetCCOpcode) {
4348 default: llvm_unreachable("Invalid integer condition!");
4349 case ISD::SETEQ: return X86::COND_E;
4350 case ISD::SETGT: return X86::COND_G;
4351 case ISD::SETGE: return X86::COND_GE;
4352 case ISD::SETLT: return X86::COND_L;
4353 case ISD::SETLE: return X86::COND_LE;
4354 case ISD::SETNE: return X86::COND_NE;
4355 case ISD::SETULT: return X86::COND_B;
4356 case ISD::SETUGT: return X86::COND_A;
4357 case ISD::SETULE: return X86::COND_BE;
4358 case ISD::SETUGE: return X86::COND_AE;
4362 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4363 /// condition code, returning the condition code and the LHS/RHS of the
4364 /// comparison to make.
4365 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4366 bool isFP, SDValue &LHS, SDValue &RHS,
4367 SelectionDAG &DAG) {
4369 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4370 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4371 // X > -1 -> X == 0, jump !sign.
4372 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4373 return X86::COND_NS;
4375 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4376 // X < 0 -> X == 0, jump on sign.
4379 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
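// X < 1 -> X <= 0 (compare against zero and use the LE condition instead).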
4381 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4382 return X86::COND_LE;
4386 return TranslateIntegerX86CC(SetCCOpcode);
// First determine whether it is required or profitable to flip the operands.
4391 // If LHS is a foldable load, but RHS is not, flip the condition.
4392 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4393 !ISD::isNON_EXTLoad(RHS.getNode())) {
4394 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4395 std::swap(LHS, RHS);
4398 switch (SetCCOpcode) {
4404 std::swap(LHS, RHS);
4408 // On a floating point condition, the flags are set as follows:
4410 // 0 | 0 | 0 | X > Y
4411 // 0 | 0 | 1 | X < Y
4412 // 1 | 0 | 0 | X == Y
4413 // 1 | 1 | 1 | unordered
4414 switch (SetCCOpcode) {
4415 default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
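
// For example, (setcc x, y, setolt) has its operands swapped above and then
// takes the SETGT path, yielding COND_A; after a UCOMISS/UCOMISD this becomes
// a JA, which is false on unordered inputs, as an ordered compare requires.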
/// Is there a floating point cmov for the specific X86 condition code?
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &I,
                                           unsigned Intrinsic) const {

  const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
  if (!IntrData)
    return false;

  Info.opc = ISD::INTRINSIC_W_CHAIN;
  Info.readMem = false;
  Info.writeMem = false;
  switch (IntrData->Type) {
  case EXPAND_FROM_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getType());
    Info.align = 1;
    Info.readMem = true;
    break;
  }
  case COMPRESS_TO_MEM: {
    Info.ptrVal = I.getArgOperand(0);
    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    Info.ptrVal = I.getArgOperand(0);
    MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
    MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
    if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
      ScalarVT = MVT::i8;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
      ScalarVT = MVT::i16;
    else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
      ScalarVT = MVT::i32;

    Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
    Info.align = 1;
    Info.writeMem = true;
    break;
  }
  default:
    return false;
  }

  return true;
}
/// Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}
bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                              ISD::LoadExtType ExtTy,
                                              EVT NewVT) const {
  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
  // relocations target a movq or addq instruction: don't let the load shrink.
  SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
  if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
    if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
      return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
  return true;
}
/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
                                                unsigned Index) const {
  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
    return false;

  return (Index == 0 || Index == ResVT.getVectorNumElements());
}
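
// For example, extracting a v4i32 subvector from a v8i32 is considered cheap
// at index 0 (a subregister copy) and at index 4 (the upper half, matching
// VEXTRACTF128/VEXTRACTI128), but not at an unaligned index such as 2.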
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}

bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}

bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}

bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
  if (!Subtarget.hasBMI())
    return false;

  // There are only 32-bit and 64-bit forms for 'andn'.
  EVT VT = Y.getValueType();
  if (VT != MVT::i32 && VT != MVT::i64)
    return false;

  return true;
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
  MVT VT = MVT::getIntegerVT(NumBits);
  if (isTypeLegal(VT))
    return VT;

  // PMOVMSKB can handle this.
  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
    return MVT::v16i8;

  // VPMOVMSKB can handle this.
  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
    return MVT::v32i8;

  // TODO: Allow 64-bit type for 32-bit target.
  // TODO: 512-bit types should be allowed, but make sure that those
  // cases are handled in combineVectorSizedSetCCEquality().

  return MVT::INVALID_SIMPLE_VALUE_TYPE;
}
/// Val is the undef sentinel value or equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
}

/// Val is either the undef or zero sentinel value.
static bool isUndefOrZero(int Val) {
  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
}
/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), is the undef sentinel value.
static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (Mask[i] != SM_SentinelUndef)
      return false;
  return true;
}
/// Return true if Val is undef or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrInRange(ArrayRef<int> Mask,
                             int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrInRange(M, Low, Hi))
      return false;
  return true;
}

/// Return true if Val is undef, zero or if its value falls within the
/// specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
}

/// Return true if every element in Mask is undef, zero or if its value
/// falls within the specified range [Low, Hi).
static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
  for (int M : Mask)
    if (!isUndefOrZeroOrInRange(M, Low, Hi))
      return false;
  return true;
}
/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), falls within the specified sequential range
/// [Low, Low+Size) or is undef.
static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
                                       unsigned Pos, unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
    if (!isUndefOrEqual(Mask[i], Low))
      return false;
  return true;
}

/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), falls within the specified sequential range
/// [Low, Low+Size), or is undef or zero.
static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                             unsigned Size, int Low) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
    if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
      return false;
  return true;
}

/// Return true if every element in Mask, in the half-open range
/// [Pos, Pos+Size), is undef or zero.
static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
                                 unsigned Size) {
  for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
    if (!isUndefOrZero(Mask[i]))
      return false;
  return true;
}
/// \brief Helper function to test whether a shuffle mask could be
/// simplified by widening the elements being shuffled.
///
/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
/// leaves it in an unspecified state.
///
/// NOTE: This must handle normal vector shuffle masks and *target* vector
/// shuffle masks. The latter have the special property of a '-2' representing
/// a zeroed lane of a vector.
static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                    SmallVectorImpl<int> &WidenedMask) {
  WidenedMask.assign(Mask.size() / 2, 0);
  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
    int M0 = Mask[i];
    int M1 = Mask[i + 1];

    // If both elements are undef, it's trivial.
    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
      WidenedMask[i / 2] = SM_SentinelUndef;
      continue;
    }

    // Check for an undef mask and a mask value properly aligned to fit with
    // a pair of values. If we find such a case, use the non-undef mask's value.
    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
      WidenedMask[i / 2] = M1 / 2;
      continue;
    }
    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // When zeroing, we need to spread the zeroing across both lanes to widen.
    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
        WidenedMask[i / 2] = SM_SentinelZero;
        continue;
      }
      return false;
    }

    // Finally check if the two mask values are adjacent and aligned with
    // each other.
    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
      WidenedMask[i / 2] = M0 / 2;
      continue;
    }

    // Otherwise we can't safely widen the elements used in this shuffle.
    return false;
  }

  assert(WidenedMask.size() == Mask.size() / 2 &&
         "Incorrect size of mask after widening the elements!");

  return true;
}
/// Helper function to scale a shuffle or target shuffle mask, replacing each
/// mask index with the scaled sequential indices for an equivalent narrowed
/// mask. This is the reverse process to canWidenShuffleElements, but can always
/// succeed.
static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &ScaledMask) {
  assert(0 < Scale && "Unexpected scaling factor");
  int NumElts = Mask.size();
  ScaledMask.assign(NumElts * Scale, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];

    // Repeat sentinel values in every mask element.
    if (M < 0) {
      for (int s = 0; s != Scale; ++s)
        ScaledMask[(Scale * i) + s] = M;
      continue;
    }

    // Scale mask element and increment across each mask element.
    for (int s = 0; s != Scale; ++s)
      ScaledMask[(Scale * i) + s] = (Scale * M) + s;
  }
}
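
// For example, scaling the v4i32 mask <0,2,-1,3> by 2 produces the v8i16 mask
// <0,1,4,5,-1,-1,6,7>: each index M becomes the pair (2*M, 2*M+1) and
// sentinel values are simply repeated.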
/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
/// extract that is suitable for instructions that extract 128- or 256-bit
/// vectors.
static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(1);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}
/// Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// insertion of 128- or 256-bit subvectors.
static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a vecWidth-bit boundary.
  uint64_t Index = N->getConstantOperandVal(2);
  MVT VT = N->getSimpleValueType(0);
  unsigned ElSize = VT.getScalarSizeInBits();
  return (Index * ElSize) % vecWidth == 0;
}
bool X86::isVINSERT128Index(SDNode *N) {
  return isVINSERTIndex(N, 128);
}

bool X86::isVINSERT256Index(SDNode *N) {
  return isVINSERTIndex(N, 256);
}

bool X86::isVEXTRACT128Index(SDNode *N) {
  return isVEXTRACTIndex(N, 128);
}

bool X86::isVEXTRACT256Index(SDNode *N) {
  return isVEXTRACTIndex(N, 256);
}
static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
         "Illegal extract subvector for VEXTRACT");

  uint64_t Index = N->getConstantOperandVal(1);
  MVT VecVT = N->getOperand(0).getSimpleValueType();
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}

static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
         "Illegal insert subvector for VINSERT");

  uint64_t Index = N->getConstantOperandVal(2);
  MVT VecVT = N->getSimpleValueType(0);
  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
  return Index / NumElemsPerChunk;
}
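
// For example, inserting a v4i32 subvector into a v8i32 at element index 4
// gives NumElemsPerChunk == 128/32 == 4 and an immediate of 4/4 == 1, i.e.
// the upper 128-bit half for VINSERTF128/VINSERTI128.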
/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 128);
}

/// Return the appropriate immediate to extract the specified
/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
  return getExtractVEXTRACTImmediate(N, 256);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 128);
}

/// Return the appropriate immediate to insert at the specified
/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
  return getInsertVINSERTImmediate(N, 256);
}
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return isNullConstant(Elt) || isNullFPConstant(Elt);
}
// Build a vector of constants.
// Use an UNDEF node if MaskElt == -1.
// Split 64-bit constants into 32-bit halves in 32-bit mode.
static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
                              const SDLoc &dl, bool IsMask = false) {

  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0; i < NumElts; ++i) {
    bool IsUndef = Values[i] < 0 && IsMask;
    SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
      DAG.getConstant(Values[i], dl, EltVT);
    Ops.push_back(OpNode);
    if (Split)
      Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
                    DAG.getConstant(0, dl, EltVT));
  }
  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  if (Split)
    ConstsNode = DAG.getBitcast(VT, ConstsNode);
  return ConstsNode;
}
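
// For example, getConstVector({0, -1, 2, 3}, MVT::v4i32, DAG, dl,
// /*IsMask=*/true) builds the vector <i32 0, i32 undef, i32 2, i32 3>, since
// IsMask turns negative values into UNDEF lanes.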
static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                              MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert(Bits.size() == Undefs.getBitWidth() &&
         "Unequal constant and undef arrays");
  SmallVector<SDValue, 32> Ops;
  bool Split = false;

  MVT ConstVecVT = VT;
  unsigned NumElts = VT.getVectorNumElements();
  bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
  if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
    ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
    Split = true;
  }

  MVT EltVT = ConstVecVT.getVectorElementType();
  for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
    if (Undefs[i]) {
      Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
      continue;
    }
    const APInt &V = Bits[i];
    assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
    if (Split) {
      Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
      Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
    } else if (EltVT == MVT::f32) {
      APFloat FV(APFloat::IEEEsingle(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else if (EltVT == MVT::f64) {
      APFloat FV(APFloat::IEEEdouble(), V);
      Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
    } else {
      Ops.push_back(DAG.getConstant(V, dl, EltVT));
    }
  }

  SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
  return DAG.getBitcast(VT, ConstsNode);
}
/// Returns a vector of specified type with all zero elements.
static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
          VT.getVectorElementType() == MVT::i1) &&
         "Unexpected vector type");

  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
  // type. This ensures they get CSE'd. But if the integer type is not
  // available, use a floating-point +0.0 instead.
  SDValue Vec;
  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
  } else if (VT.getVectorElementType() == MVT::i1) {
    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
           "Unexpected vector type");
    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
           "Unexpected vector type");
    Vec = DAG.getConstant(0, dl, VT);
  } else {
    unsigned Num32BitElts = VT.getSizeInBits() / 32;
    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
  }
  return DAG.getBitcast(VT, Vec);
}
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
                                const SDLoc &dl, unsigned vectorWidth) {
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  unsigned Factor = VT.getSizeInBits()/vectorWidth;
  EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                  VT.getVectorNumElements()/Factor);

  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR.
  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
  // low bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  // If the input is a buildvector just emit a smaller one.
  if (Vec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
/// instructions or a simple subregister reference. Idx is an index in the
/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering EXTRACT_VECTOR_ELT operations easier.
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return extractSubVector(Vec, IdxVal, DAG, dl, 256);
}
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                               SelectionDAG &DAG, const SDLoc &dl,
                               unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");
  // Inserting UNDEF just returns Result.
  if (Vec.isUndef())
    return Result;
  EVT VT = Vec.getValueType();
  EVT ElVT = VT.getVectorElementType();
  EVT ResultVT = Result.getValueType();

  // Insert the relevant vectorWidth bits.
  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

  // This is the index of the first element of the vectorWidth-bit chunk
  // we want. Since ElemsPerChunk is a power of 2, we just need to clear the
  // low bits.
  IdxVal &= ~(ElemsPerChunk - 1);

  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
}
/// Generate a DAG to put 128-bits into a vector > 128 bits. This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference. Idx is an index in the 128 bits
/// we want. It need not be aligned to a 128-bit boundary. That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
}

static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                  SelectionDAG &DAG, const SDLoc &dl) {
  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
  return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
}
/// Insert i1-subvector to i1-vector.
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue SubVec = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);

  if (!isa<ConstantSDNode>(Idx))
    return SDValue();

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
    return Op;
  MVT OpVT = Op.getSimpleValueType();
  MVT SubVecVT = SubVec.getSimpleValueType();
  unsigned NumElems = OpVT.getVectorNumElements();
  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

  assert(IdxVal + SubVecNumElems <= NumElems &&
         IdxVal % SubVecVT.getSizeInBits() == 0 &&
         "Unexpected index value in INSERT_SUBVECTOR");

  // There are 3 possible cases:
  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
  // 2. Subvector should be inserted in the upper part
  //    (IdxVal + SubVecNumElems == NumElems)
  // 3. Subvector should be inserted in the middle (for example v2i1
  //    to v16i1, index 2)
  // Extend to a natively supported kshift width.
  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
  MVT WideOpVT = OpVT;
  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
    WideOpVT = MinVT;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  SDValue Undef = DAG.getUNDEF(WideOpVT);
  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                                   Undef, SubVec, ZeroIdx);

  // Extract the sub-vector if required.
  auto ExtractSubVec = [&](SDValue V) {
    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
                                                OpVT, V, ZeroIdx);
  };

  if (Vec.isUndef()) {
    if (IdxVal != 0) {
      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                               ShiftBits);
    }
    return ExtractSubVec(WideSubVec);
  }
  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    NumElems = WideOpVT.getVectorNumElements();
    unsigned ShiftLeft = NumElems - SubVecNumElems;
    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                      DAG.getConstant(ShiftLeft, dl, MVT::i8));
    Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
                                   DAG.getConstant(ShiftRight, dl, MVT::i8))
                     : Vec;
    return ExtractSubVec(Vec);
  }

  if (IdxVal == 0) {
    // Zero out the lower bits of Vec.
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    // Merge them together, SubVec should be zero extended.
    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
                             WideSubVec, ZeroIdx);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }
  // Simple case: the subvector goes into the upper part.
  if (IdxVal + SubVecNumElems == NumElems) {
    // Zero out the upper bits of Vec.
    WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
    return ExtractSubVec(Vec);
  }

  // The subvector is inserted in the middle - use a shuffle.
  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
                           SubVec, ZeroIdx);
  SmallVector<int, 64> Mask;
  for (unsigned i = 0; i < NumElems; ++i)
    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
                   i : i + NumElems);
  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
}
/// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
/// instructions. This is used because creating CONCAT_VECTOR nodes of
/// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
/// large BUILD_VECTORS.
static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
}

static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
                                   unsigned NumElems, SelectionDAG &DAG,
                                   const SDLoc &dl) {
  SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
  return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
}
/// Returns a vector of specified type with all bits set.
/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
/// Then bitcast to their original type, ensuring they get CSE'd.
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Expected a 128/256/512-bit vector type");

  APInt Ones = APInt::getAllOnesValue(32);
  unsigned NumElts = VT.getSizeInBits() / 32;
  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
  return DAG.getBitcast(VT, Vec);
}
static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
                              SelectionDAG &DAG) {
  EVT InVT = In.getValueType();
  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

  if (VT.is128BitVector() && InVT.is128BitVector())
    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

  // For 256-bit vectors, we only need the lower (128-bit) input half.
  // For 512-bit vectors, we only need the lower input half or quarter.
  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
    In = extractSubVector(In, 0, DAG, DL,
                          std::max(128, (int)VT.getSizeInBits() / Scale));
  }

  return DAG.getNode(Opc, DL, VT, In);
}
/// Generate unpacklo/unpackhi shuffle mask.
static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                    bool Unary) {
  assert(Mask.empty() && "Expected an empty shuffle mask vector");
  int NumElts = VT.getVectorNumElements();
  int NumEltsInLane = 128 / VT.getScalarSizeInBits();

  for (int i = 0; i < NumElts; ++i) {
    unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
}
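
// For example, for v8i16 with Lo=true and Unary=false this produces the
// PUNPCKLWD mask <0,8,1,9,2,10,3,11>, interleaving the low halves of the two
// inputs; Lo=false interleaves the upper halves: <4,12,5,13,6,14,7,15>.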
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

/// Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
                          SDValue V1, SDValue V2) {
  SmallVector<int, 8> Mask;
  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
/// Return a vector_shuffle of the specified vector and a zero or undef vector.
/// This produces a shuffle where the low element of V2 is swizzled into the
/// zero/undef vector, landing at element Idx.
/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
                                           bool IsZero,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getSimpleValueType();
  SDValue V1 = IsZero
    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
  int NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec(NumElems);
  for (int i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec[i] = (i == Idx) ? NumElems : i;
  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
static SDValue peekThroughBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  return V;
}

static SDValue peekThroughOneUseBitcasts(SDValue V) {
  while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
         V.getOperand(0).hasOneUse())
    V = V.getOperand(0);
  return V;
}

static const Constant *getTargetConstantFromNode(SDValue Op) {
  Op = peekThroughBitcasts(Op);

  auto *Load = dyn_cast<LoadSDNode>(Op);
  if (!Load)
    return nullptr;

  SDValue Ptr = Load->getBasePtr();
  if (Ptr->getOpcode() == X86ISD::Wrapper ||
      Ptr->getOpcode() == X86ISD::WrapperRIP)
    Ptr = Ptr->getOperand(0);

  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
  if (!CNode || CNode->isMachineConstantPoolEntry())
    return nullptr;

  return dyn_cast<Constant>(CNode->getConstVal());
}
// Extract raw constant bits from constant pools.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
                                          APInt &UndefElts,
                                          SmallVectorImpl<APInt> &EltBits,
                                          bool AllowWholeUndefs = true,
                                          bool AllowPartialUndefs = true) {
  assert(EltBits.empty() && "Expected an empty EltBits vector");

  Op = peekThroughBitcasts(Op);

  EVT VT = Op.getValueType();
  unsigned SizeInBits = VT.getSizeInBits();
  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
  unsigned NumElts = SizeInBits / EltSizeInBits;

  unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
  unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

  // Extract all the undef/constant element data and pack into single bitsets.
  APInt UndefBits(SizeInBits, 0);
  APInt MaskBits(SizeInBits, 0);

  // Split the undef/constant single bitset data into the target elements.
  auto SplitBitData = [&]() {
    // Don't split if we don't allow undef bits.
    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
    if (UndefBits.getBoolValue() && !AllowUndefs)
      return false;

    UndefElts = APInt(NumElts, 0);
    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

    for (unsigned i = 0; i != NumElts; ++i) {
      unsigned BitOffset = i * EltSizeInBits;
      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
      // Only treat an element as UNDEF if all bits are UNDEF.
      if (UndefEltBits.isAllOnesValue()) {
        if (!AllowWholeUndefs)
          return false;
        UndefElts.setBit(i);
        continue;
      }

      // If only some bits are UNDEF then treat them as zero (or bail if not
      // supported).
      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
        return false;

      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
      EltBits[i] = Bits.getZExtValue();
    }
    return true;
  };
  // Collect constant bits and insert into mask/undef bit masks.
  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
                                unsigned BitOffset) {
    if (isa<UndefValue>(Cst)) {
      unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
      Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
      return true;
    }
    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
      Mask.insertBits(CInt->getValue(), BitOffset);
      return true;
    }
    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
      Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
      return true;
    }
    return false;
  };
  // Extract constant bits from build vector.
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
      const SDValue &Src = Op.getOperand(i);
      unsigned BitOffset = i * SrcEltSizeInBits;
      if (Src.isUndef()) {
        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
        continue;
      }
      auto *Cst = cast<ConstantSDNode>(Src);
      APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
      MaskBits.insertBits(Bits, BitOffset);
    }
    return SplitBitData();
  }
  // Extract constant bits from constant pool vector.
  if (auto *Cst = getTargetConstantFromNode(Op)) {
    Type *CstTy = Cst->getType();
    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
      return false;

    unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
      if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
                               i * CstEltSizeInBits))
        return false;

    return SplitBitData();
  }
  // Extract constant bits from a broadcasted constant pool scalar.
  if (Op.getOpcode() == X86ISD::VBROADCAST &&
      EltSizeInBits <= SrcEltSizeInBits) {
    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
      APInt Bits(SizeInBits, 0);
      APInt Undefs(SizeInBits, 0);
      if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
        for (unsigned i = 0; i != NumSrcElts; ++i) {
          MaskBits |= Bits.shl(i * SrcEltSizeInBits);
          UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
        }
        return SplitBitData();
      }
    }
  }

  // Extract a rematerialized scalar constant insertion.
  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
    MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
    MaskBits = MaskBits.zext(SizeInBits);
    return SplitBitData();
  }

  return false;
}
static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                        unsigned MaskEltSizeInBits,
                                        SmallVectorImpl<uint64_t> &RawMask) {
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;

  // Extract the raw target constant bits.
  // FIXME: We currently don't support UNDEF bits or mask entries.
  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
                                     EltBits, /* AllowWholeUndefs */ false,
                                     /* AllowPartialUndefs */ false))
    return false;

  // Insert the extracted elements into the mask.
  for (APInt Elt : EltBits)
    RawMask.push_back(Elt.getZExtValue());

  return true;
}
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
/// Sets \p IsUnary to true if only one source is used. Note that this will set
/// IsUnary for shuffles which use a single input multiple times, and in those
/// cases it will adjust the mask to only have indices within that single input.
/// It is an error to call this with non-empty Mask/Ops vectors.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
  unsigned NumElems = VT.getVectorNumElements();
  SDValue ImmN;

  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

  IsUnary = false;
  bool IsFakeUnary = false;
  switch(N->getOpcode()) {
  case X86ISD::BLENDI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::SHUFP:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::INSERTPS:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKH:
    DecodeUNPCKHMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::UNPCKL:
    DecodeUNPCKLMask(VT, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVHLPS:
    DecodeMOVHLPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVLHPS:
    DecodeMOVLHPSMask(NumElems, Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::PALIGNR:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(0));
    break;
  case X86ISD::VSHLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VSRLDQ:
    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
    ImmN = N->getOperand(N->getNumOperands() - 1);
    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFD:
  case X86ISD::VPERMILPI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFHW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::PSHUFLW:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::VZEXT_MOVL:
    DecodeZeroMoveLowMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::VBROADCAST: {
    SDValue N0 = N->getOperand(0);
    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
    // add the pre-extracted value to the Ops vector.
    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        N0.getOperand(0).getValueType() == VT &&
        N0.getConstantOperandVal(1) == 0)
      Ops.push_back(N0.getOperand(0));

    // We only decode broadcasts of same-sized vectors, unless the broadcast
    // came from an extract from the original width. If we found one, we
    // pushed it onto the Ops vector above.
    if (N0.getValueType() == VT || !Ops.empty()) {
      DecodeVectorBroadcast(VT, Mask);
      IsUnary = true;
      break;
    }
    return false;
  }
  case X86ISD::VPERMILPV: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMILPMask(VT, RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMILPMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::PSHUFB: {
    IsUnary = true;
    SDValue MaskNode = N->getOperand(1);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodePSHUFBMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodePSHUFBMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMI:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
    break;
  case X86ISD::VPERM2X128:
    ImmN = N->getOperand(N->getNumOperands()-1);
    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    break;
  case X86ISD::MOVSLDUP:
    DecodeMOVSLDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVSHDUP:
    DecodeMOVSHDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVDDUP:
    DecodeMOVDDUPMask(VT, Mask);
    IsUnary = true;
    break;
  case X86ISD::MOVLHPD:
  case X86ISD::MOVLPD:
  case X86ISD::MOVLPS:
    // Not yet implemented
    return false;
  case X86ISD::VPERMIL2: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    SDValue MaskNode = N->getOperand(2);
    SDValue CtrlNode = N->getOperand(3);
    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
      unsigned CtrlImm = CtrlOp->getZExtValue();
      SmallVector<uint64_t, 32> RawMask;
      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
        break;
      }
      if (auto *C = getTargetConstantFromNode(MaskNode)) {
        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
        break;
      }
    }
    return false;
  }
  case X86ISD::VPPERM: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
    SDValue MaskNode = N->getOperand(2);
    SmallVector<uint64_t, 32> RawMask;
    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
      DecodeVPPERMMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPPERMMask(C, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV: {
    IsUnary = true;
    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
    Ops.push_back(N->getOperand(1));
    SDValue MaskNode = N->getOperand(0);
    SmallVector<uint64_t, 32> RawMask;
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
      DecodeVPERMVMask(RawMask, Mask);
      break;
    }
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMVMask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMV3: {
    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
    Ops.push_back(N->getOperand(0));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(1);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  case X86ISD::VPERMIV3: {
    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
    Ops.push_back(N->getOperand(1));
    Ops.push_back(N->getOperand(2));
    SDValue MaskNode = N->getOperand(0);
    unsigned MaskEltSize = VT.getScalarSizeInBits();
    if (auto *C = getTargetConstantFromNode(MaskNode)) {
      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
      break;
    }
    return false;
  }
  default: llvm_unreachable("unknown target shuffle node");
  }

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zeroed elements.
  if (!AllowSentinelZero)
    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
      return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}
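
// For example, a PSHUFD node with immediate 0x1B (binary 00 01 10 11) decodes
// to the unary v4i32 mask <3,2,1,0>, with Ops holding the single input vector.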
/// Check a target shuffle mask's inputs to see if we can set any values to
/// SM_SentinelZero - this is for elements that are known to be zero
/// (not just zeroable) from their inputs.
/// Returns true if the target shuffle mask was decoded.
static bool setTargetShuffleZeroElements(SDValue N,
                                         SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0)
      continue;

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      Mask[i] = SM_SentinelUndef;
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        Mask[i] = SM_SentinelUndef;
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        Mask[i] = SM_SentinelZero;
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        Mask[i] = SM_SentinelUndef;
      else if (SrcEltBits[SrcIdx][M] == 0)
        Mask[i] = SM_SentinelZero;
    }
  }

  assert(VT.getVectorNumElements() == Mask.size() &&
         "Different mask size from vector size!");
  return true;
}
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops) {
  if (!N.getValueType().isVector())
    return false;

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
         "Expected byte aligned value types");

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      if (UndefElts[i]) {
        Mask.push_back(SM_SentinelUndef);
        continue;
      }
      uint64_t ByteBits = EltBits[i].getZExtValue();
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::SCALAR_TO_VECTOR: {
    // Match against a scalar_to_vector of an extract from a similar vector.
    SDValue N0 = N.getOperand(0);
    if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        N0.getOperand(0).getValueType() != VT ||
        !isa<ConstantSDNode>(N0.getOperand(1)) ||
        NumElts <= N0.getConstantOperandVal(1) ||
        !N->isOnlyUserOf(N0.getNode()))
      return false;
    Ops.push_back(N0.getOperand(0));
    Mask.push_back(N0.getConstantOperandVal(1));
    Mask.append(NumElts - 1, SM_SentinelUndef);
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: {
    SDValue InVec = N.getOperand(0);
    SDValue InScl = N.getOperand(1);
    uint64_t InIdx = N.getConstantOperandVal(2);
    assert(InIdx < NumElts && "Illegal insertion index");

    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
    if (X86::isZeroNode(InScl)) {
      Ops.push_back(InVec);
      for (unsigned i = 0; i != NumElts; ++i)
        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
      return true;
    }

    // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
    unsigned ExOp =
        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
    if (InScl.getOpcode() != ISD::AssertZext ||
        InScl.getOperand(0).getOpcode() != ExOp)
      return false;

    SDValue ExVec = InScl.getOperand(0).getOperand(0);
    uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
    assert(ExIdx < NumElts && "Illegal extraction index");
    Ops.push_back(InVec);
    Ops.push_back(ExVec);
    for (unsigned i = 0; i != NumElts; ++i)
      Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
    return true;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    uint64_t ShiftVal = N.getConstantOperandVal(1);
    // Out of range bit shifts are guaranteed to be zero.
    if (NumBitsPerElt <= ShiftVal) {
      Mask.append(NumElts, SM_SentinelZero);
      return true;
    }

    // We can only decode 'whole byte' bit shifts as shuffles.
    if ((ShiftVal % 8) != 0)
      break;

    uint64_t ByteShift = ShiftVal / 8;
    unsigned NumBytes = NumSizeInBits / 8;
    unsigned NumBytesPerElt = NumBitsPerElt / 8;
    Ops.push_back(N.getOperand(0));

    // Clear mask to all zeros and insert the shifted byte indices.
    Mask.append(NumBytes, SM_SentinelZero);

    if (X86ISD::VSHLI == Opcode) {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j] = i + j - ByteShift;
    } else {
      for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
          Mask[i + j - ByteShift] = i + j;
    }
    return true;
  }
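
  // For example, a v2i64 VSRLI by 16 bits is a byte shift of 2 within each
  // 8-byte element, giving the v16i8 mask
  // <2,3,4,5,6,7,zero,zero, 10,11,12,13,14,15,zero,zero>.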
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VZEXT: {
    // TODO - add support for VPMOVZX with smaller input vector types.
    SDValue Src = N.getOperand(0);
    MVT SrcVT = Src.getSimpleValueType();
    if (NumSizeInBits != SrcVT.getSizeInBits())
      break;
    DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
    Ops.push_back(Src);
    return true;
  }
  }

  return false;
}
/// Removes unused shuffle source inputs and adjusts the shuffle mask
/// accordingly.
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                              SmallVectorImpl<int> &Mask) {
  int MaskWidth = Mask.size();
  SmallVector<SDValue, 16> UsedInputs;
  for (int i = 0, e = Inputs.size(); i < e; ++i) {
    int lo = UsedInputs.size() * MaskWidth;
    int hi = lo + MaskWidth;
    if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
      UsedInputs.push_back(Inputs[i]);
      continue;
    }
    for (int &M : Mask)
      if (lo <= M)
        M -= MaskWidth;
  }
  Inputs = UsedInputs;
}
/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
/// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
/// remaining input indices in case we now have a unary shuffle and adjust the
/// inputs accordingly.
/// Returns true if the target shuffle mask was decoded.
static bool resolveTargetShuffleInputs(SDValue Op,
                                       SmallVectorImpl<SDValue> &Inputs,
                                       SmallVectorImpl<int> &Mask) {
  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
    if (!getFauxShuffleMask(Op, Mask, Inputs))
      return false;

  resolveTargetShuffleInputsAndMask(Inputs, Mask);
  return true;
}
/// Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    int Elt = SV->getMaskElt(Index);

    if (Elt < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    unsigned NumElems = VT.getVectorNumElements();
    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
                                         : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
  }
  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    MVT ShufVT = V.getSimpleValueType();
    MVT ShufSVT = ShufVT.getVectorElementType();
    int NumElems = (int)ShufVT.getVectorNumElements();
    SmallVector<int, 16> ShuffleMask;
    SmallVector<SDValue, 16> ShuffleOps;
    bool IsUnary;

    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
      return SDValue();

    int Elt = ShuffleMask[Index];
    if (Elt == SM_SentinelZero)
      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
    if (Elt == SM_SentinelUndef)
      return DAG.getUNDEF(ShufSVT);

    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
                               Depth+1);
  }
  // Actual nodes that may contain scalar elements.
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 8 && !Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;

  // SSE4.1 - use PINSRB to insert each byte directly.
  if (Subtarget.hasSSE41()) {
    for (unsigned i = 0; i < 16; ++i) {
      bool IsNonZero = (NonZeros & (1 << i)) != 0;
      if (IsNonZero) {
        // If the build vector contains zeros or our first insertion is not the
        // first index then insert into zero vector to break any register
        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
        if (First) {
          First = false;
          if (NumZero || 0 != i)
            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
          else {
            assert(0 == i && "Expected insertion into zero-index");
            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
            V = DAG.getBitcast(MVT::v16i8, V);
            continue;
          }
        }
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
                        Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
    }

    return V;
  }
  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      // FIXME: Investigate extending to i32 instead of just i16.
      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
      SDValue ThisElt, LastElt;
      bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
      if (LastIsNonZero) {
        LastElt =
            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
                              DAG.getConstant(8, dl, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt) {
        if (1 == i) {
          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
          V = DAG.getBitcast(MVT::v8i16, V);
        } else {
          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                          DAG.getIntPtrConstant(i / 2, dl));
        }
      }
    }
  }

  return DAG.getBitcast(MVT::v16i8, V);
}
/// Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (NumNonZero > 4 && !Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  SDValue V;
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool IsNonZero = (NonZeros & (1 << i)) != 0;
    if (IsNonZero) {
      // If the build vector contains zeros or our first insertion is not the
      // first index then insert into zero vector to break any register
      // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
      if (First) {
        First = false;
        if (NumZero || 0 != i)
          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
        else {
          assert(0 == i && "Expected insertion into zero-index");
          V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
          V = DAG.getBitcast(MVT::v8i16, V);
          continue;
        }
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
                      Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
  }

  return V;
}
/// Custom lower build_vector of v4i32 or v4f32.
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // Find all zeroable elements.
  std::bitset<4> Zeroable;
  for (int i=0; i < 4; ++i) {
    SDValue Elt = Op->getOperand(i);
    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
  }
  assert(Zeroable.size() - Zeroable.count() > 1 &&
         "We expect at least two non-zero elements!");

  // We only know how to deal with build_vector nodes where elements are either
  // zeroable or extract_vector_elt with constant index.
  SDValue FirstNonZero;
  unsigned FirstNonZeroIdx;
  for (unsigned i=0; i < 4; ++i) {
    if (Zeroable[i])
      continue;
    SDValue Elt = Op->getOperand(i);
    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Elt.getOperand(1)))
      return SDValue();
    // Make sure that this node is extracting from a 128-bit vector.
    MVT VT = Elt.getOperand(0).getSimpleValueType();
    if (!VT.is128BitVector())
      return SDValue();
    if (!FirstNonZero.getNode()) {
      FirstNonZero = Elt;
      FirstNonZeroIdx = i;
    }
  }

  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
  SDValue V1 = FirstNonZero.getOperand(0);
  MVT VT = V1.getSimpleValueType();

  // See if this build_vector can be lowered as a blend with zero.
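  // E.g. <x0, zero, x2, zero> becomes shuffle(X, ZeroVec, <0,5,2,7>), where
  // mask indices >= 4 select lanes from the zero vector.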
  SDValue Elt;
  unsigned EltMaskIdx, EltIdx;
  int Mask[4];
  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
    if (Zeroable[EltIdx]) {
      // The zero vector will be on the right hand side.
      Mask[EltIdx] = EltIdx+4;
      continue;
    }

    Elt = Op->getOperand(EltIdx);
    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
    EltMaskIdx = Elt.getConstantOperandVal(1);
    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
      break;
    Mask[EltIdx] = EltIdx;
  }

  if (EltIdx == 4) {
    // Let the shuffle legalizer deal with blend operations.
    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
    if (V1.getSimpleValueType() != VT)
      V1 = DAG.getBitcast(VT, V1);
    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
  }

  // See if we can lower this build_vector to a INSERTPS.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDValue V2 = Elt.getOperand(0);
  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
    V1 = SDValue();

  bool CanFold = true;
  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
    if (Zeroable[i])
      continue;

    SDValue Current = Op->getOperand(i);
    SDValue SrcVector = Current->getOperand(0);
    if (!V1.getNode())
      V1 = SrcVector;
    CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
  }

  if (!CanFold)
    return SDValue();

  assert(V1.getNode() && "Expected at least two non-zero elements!");
  if (V1.getSimpleValueType() != MVT::v4f32)
    V1 = DAG.getBitcast(MVT::v4f32, V1);
  if (V2.getSimpleValueType() != MVT::v4f32)
    V2 = DAG.getBitcast(MVT::v4f32, V2);

  // Ok, we can emit an INSERTPS instruction.
  unsigned ZMask = Zeroable.to_ulong();

  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
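  // The INSERTPS imm8 layout is: bits [7:6] = source lane of V2 (CountS),
  // bits [5:4] = destination lane in V1 (CountD), bits [3:0] = lanes to zero
  // (ZMask). E.g. copying V2[2] into lane 1 while zeroing lane 3 encodes as
  // 0b10011000.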
  SDLoc DL(Op);
  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                               DAG.getIntPtrConstant(InsertPSMask, DL));
  return DAG.getBitcast(VT, Result);
}
/// Return a vector logical shift node.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
                         SelectionDAG &DAG, const TargetLowering &TLI,
                         const SDLoc &dl) {
  assert(VT.is128BitVector() && "Unknown type for VShift");
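  // VSHLDQ/VSRLDQ are whole-register byte shifts (PSLLDQ/PSRLDQ), so the
  // source is bitcast to v16i8 and the shift amount is converted from bits
  // to bytes below.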
  MVT ShVT = MVT::v16i8;
  unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
  SrcOp = DAG.getBitcast(ShVT, SrcOp);
  MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
  SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
  return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
}
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
                                      SelectionDAG &DAG) {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    // FIXME: 256-bit vector instructions don't require a strict alignment,
    // improve this code to support it better.
    unsigned RequiredAlign = VT.getSizeInBits()/8;
    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16 or 32.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
      if (MFI.isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI.setObjectAlignment(FI, RequiredAlign);
      }
    }

    // (Offset % 16 or 32) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if ((Offset % RequiredAlign) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
    if (StartOffset) {
      SDLoc DL(Ptr);
      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
    }

    int EltNo = (Offset - StartOffset) >> 2;
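    // E.g. a 4-byte element at Offset 20 with RequiredAlign 16 loads from
    // StartOffset 16 and splats element (20 - 16) >> 2 == 1 of the wide load.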
    unsigned NumElems = VT.getVectorNumElements();

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset));

    SmallVector<int, 8> Mask(NumElems, EltNo);

    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
  }

  return SDValue();
}
/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
/// elements can be replaced by a single large load which has the same value as
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
                                        const SDLoc &DL, SelectionDAG &DAG,
                                        bool isAfterLegalize) {
  unsigned NumElems = Elts.size();

  int LastLoadedElt = -1;
  SmallBitVector LoadMask(NumElems, false);
  SmallBitVector ZeroMask(NumElems, false);
  SmallBitVector UndefMask(NumElems, false);

  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = peekThroughBitcasts(Elts[i]);
    if (!Elt.getNode())
      return SDValue();

    if (Elt.isUndef())
      UndefMask[i] = true;
    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
      ZeroMask[i] = true;
    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
      LoadMask[i] = true;
      LastLoadedElt = i;
      // Each loaded element must be the correct fractional portion of the
      // requested vector load.
      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
        return SDValue();
    } else
      return SDValue();
  }
  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
         "Incomplete element masks");

  // Handle Special Cases - all undef or undef/zero.
  if (UndefMask.count() == NumElems)
    return DAG.getUNDEF(VT);

  // FIXME: Should we return this as a BUILD_VECTOR instead?
  if ((ZeroMask | UndefMask).count() == NumElems)
    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
                          : DAG.getConstantFP(0.0, DL, VT);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  int FirstLoadedElt = LoadMask.find_first();
  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
  EVT LDBaseVT = EltBase.getValueType();

  // Consecutive loads can contain UNDEFS but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs elements require an
  // additional shuffle stage to clear the ZERO elements.
  bool IsConsecutiveLoad = true;
  bool IsConsecutiveLoadWithZeros = true;
  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
    if (LoadMask[i]) {
      SDValue Elt = peekThroughBitcasts(Elts[i]);
      LoadSDNode *LD = cast<LoadSDNode>(Elt);
      if (!DAG.areNonVolatileConsecutiveLoads(
              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
              i - FirstLoadedElt)) {
        IsConsecutiveLoad = false;
        IsConsecutiveLoadWithZeros = false;
        break;
      }
    } else if (ZeroMask[i]) {
      IsConsecutiveLoad = false;
    }
  }

  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
    auto MMOFlags = LDBase->getMemOperand()->getFlags();
    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
           "Cannot merge volatile loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);

    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain =
          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
                      SDValue(NewLd.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                             SDValue(NewLd.getNode(), 1));
    }

    return NewLd;
  };
  // LOAD - all consecutive load/undefs (must start/end with a load).
  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer.
  // If the vector contains zeros, then attempt to shuffle those elements.
  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
    assert(LDBase && "Did not find base load for merging consecutive loads");
    EVT EltVT = LDBase->getValueType(0);
    // Ensure that the input vector size for the merged loads matches the
    // cumulative size of the input elements.
    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
      return SDValue();

    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
      return SDValue();

    if (IsConsecutiveLoad)
      return CreateLoad(VT, LDBase);

    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
    // vector and a zero vector to clear out the zero elements.
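    // E.g. <ld0, ld1, zero, ld3> becomes a full vector load shuffled against
    // zero with the mask <0, 1, 6, 3> (indices >= NumElems pick zero lanes).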
    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
      SmallVector<int, 4> ClearMask(NumElems, -1);
      for (unsigned i = 0; i < NumElems; ++i) {
        if (ZeroMask[i])
          ClearMask[i] = i + NumElems;
        else if (LoadMask[i])
          ClearMask[i] = i;
      }
      SDValue V = CreateLoad(VT, LDBase);
      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                 : DAG.getConstantFP(0.0, DL, VT);
      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
    }
  }

  int LoadSize =
      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
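  // E.g. a v4i32 built as <load a, load a+4, zero, zero> becomes a single
  // VZEXT_LOAD of the low 64 bits with the upper lanes zero-filled.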
  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
      (LoadSize == 32 || LoadSize == 64) &&
      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
                                      : MVT::getIntegerVT(LoadSize);
    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
    if (TLI.isTypeLegal(VecVT)) {
      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
      SDValue ResNode =
          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
                                  LDBase->getPointerInfo(),
                                  LDBase->getAlignment(),
                                  false/*isVolatile*/, true/*ReadMem*/,
                                  false/*WriteMem*/);

      // Make sure the newly-created LOAD is in the same position as LDBase in
      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
      // and update uses of LDBase's output chain to use the TokenFactor.
      if (LDBase->hasAnyUseOfValue(1)) {
        SDValue NewChain =
            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
                        SDValue(ResNode.getNode(), 1));
        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
                               SDValue(ResNode.getNode(), 1));
      }

      return DAG.getBitcast(VT, ResNode);
    }
  }

  return SDValue();
}
static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  unsigned NumElm = SplatBitSize / ScalarSize;

  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned i = 0; i < NumElm; i++) {
    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
    Constant *Const;
    if (VT.isFloatingPoint()) {
      assert((ScalarSize == 32 || ScalarSize == 64) &&
             "Unsupported floating point scalar size");
      if (ScalarSize == 32)
        Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
      else
        Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
    } else
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
    ConstantVec.push_back(Const);
  }
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}
static bool isUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    if (isTargetShuffle(U->getOpcode()))
      return true;
    if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
      return isUseOfShuffle(U);
  }
  return false;
}
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  BitVector UndefElements;
  SDValue Ld = BVOp->getSplatValue(&UndefElements);

  // We need a splat of a single value to use broadcast, and it doesn't
  // make any sense if the value is only in one element of the vector.
  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with broadcast when it's a use of a shuffle
      // instruction to preserve the present custom lowering of shuffles.
      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
        return SDValue();
      // Replace BUILD_VECTOR with broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
            !(SplatBitSize == 64 && Subtarget.is32Bit())) {
          // Splatted value can fit in one INTEGER constant in constant pool.
          // Load the constant and broadcast it.
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
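          // E.g. a v8i32 whose constants repeat with a 64-bit period has
          // CVT == i64 and Repeat == 4: the i64 constant is broadcast into
          // v4i64 and the result bitcast back to v8i32.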
          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize == 32 || SplatBitSize == 64) {
          // Splatted value can fit in one FLOAT constant in constant pool.
          // Load the constant and broadcast it.
          // AVX has support for 32 and 64 bit broadcast for floats only.
          // No 64bit integer in 32bit subtarget.
          MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
          Constant *C = SplatBitSize == 32
                            ? ConstantFP::get(Type::getFloatTy(*Ctx),
                                              SplatValue.bitsToFloat())
                            : ConstantFP::get(Type::getDoubleTy(*Ctx),
                                              SplatValue.bitsToDouble());
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
          unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
          Ld = DAG.getLoad(
              CVT, dl, DAG.getEntryNode(), CP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
                                       MVT::getVectorVT(CVT, Repeat), Ld);
          return DAG.getBitcast(VT, Brdcst);
        } else if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          MVT CVT = VT.getScalarType();
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
          Ld = DAG.getLoad(
              MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
              Alignment);
          SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
          return DAG.getBitcast(VT, Brdcst);
        }
      }
    }
    return SDValue();
  }
  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
      Ld = DAG.getLoad(
          CVT, dl, DAG.getEntryNode(), CP,
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
          Alignment);

      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
    }
  }

  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match f64, since there is no vbroadcastsd xmm.
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  }

  // Unsupported broadcast.
  return SDValue();
}
/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
  //                            undef)
  //                       Constant<2>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than one element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from vector of different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}
static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
         Op.getScalarValueSizeInBits() == 1 &&
         "Can not convert non-constant vector");
  uint64_t Immediate = 0;
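  // E.g. v4i1 <1,0,0,1> becomes the immediate 0b1001; bit idx of the integer
  // holds element idx, and undef elements contribute a zero bit.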
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (!In.isUndef())
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
  }
  SDLoc dl(Op);
  MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
  return DAG.getConstant(Immediate, dl, VT);
}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
SDValue
X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()))
    return DAG.getTargetConstant(0, dl, VT);

  if (ISD::isBuildVectorAllOnes(Op.getNode()))
    return DAG.getTargetConstant(1, dl, VT);

  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
    if (Imm.getValueSizeInBits() == VT.getSizeInBits())
      return DAG.getBitcast(VT, Imm);
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  // Vector has one or more non-const elements
  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (!isa<ConstantSDNode>(In))
      NonConstIdx.push_back(idx);
    else {
      Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
  if (IsSplat)
    return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
                         DAG.getConstant(1, dl, VT),
                         DAG.getConstant(0, dl, VT));

  // insert elements one by one
  SDValue DstVec;
  SDValue Imm;
  if (Immediate) {
    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
    Imm = DAG.getConstant(Immediate, dl, ImmVT);
  }
  else if (HasConstElts)
    Imm = DAG.getConstant(0, dl, VT);
  else
    Imm = DAG.getUNDEF(VT);
  if (Imm.getValueSizeInBits() == VT.getSizeInBits())
    DstVec = DAG.getBitcast(VT, Imm);
  else {
    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
    DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
  }

  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}
/// \brief Return true if \p N implements a horizontal binop and return the
/// operands for the horizontal binop into V0 and V1.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function checks that the build_vector \p N in input implements a
/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
/// operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
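///
/// For instance, the v4f32 build_vector
///   <(fadd A[0], A[1]), (fadd A[2], A[3]), (fadd B[0], B[1]), (fadd B[2], B[3])>
/// matches ISD::FADD over [0, 4) with V0 = A and V1 = B.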
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
                              SelectionDAG &DAG,
                              unsigned BaseIdx, unsigned LastIdx,
                              SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);

  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    }

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}
/// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
/// the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as input
/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
/// horizontal binop dag node takes as input the lower 128-bit of V1
/// and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V0_HI
///     HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as input the lower
/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
///   Example:
///     HADD V0_LO, V1_LO
///     HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128-bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
  MVT NewVT = V0_LO.getSimpleValueType();

  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of ADDSUB operation.
/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
/// are written to the parameters \p Opnd0 and \p Opnd1.
static bool isAddSub(const BuildVectorSDNode *BV,
                     const X86Subtarget &Subtarget, SelectionDAG &DAG,
                     SDValue &Opnd0, SDValue &Opnd1) {

  MVT VT = BV->getSimpleValueType(0);
  if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
      (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
      (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  // Odd-numbered elements in the input build vector are obtained from
  // adding two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two integer/float elements.
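  // E.g. the v4f32 build_vector
  //   <(fsub A[0], B[0]), (fadd A[1], B[1]), (fsub A[2], B[2]), (fadd A[3], B[3])>
  // is equivalent to ADDSUB(A, B).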
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return false;

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);
  }

  // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
  if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
    return false;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}
/// Returns true if it is possible to fold MUL and an idiom that has already
/// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
/// If (and only if) true is returned, the operands of FMADDSUB are written to
/// parameters \p Opnd0, \p Opnd1, \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
///   %AB = fmul fast <2 x double> %A, %B
///   %Sub = fsub fast <2 x double> %AB, %C
///   %Add = fadd fast <2 x double> %AB, %C
///   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///                           <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
///   %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
///   %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
  if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
      !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}
/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
/// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
  // the ADDSUB idiom has been successfully recognized. There are no known
  // X86 targets with 512-bit ADDSUB instructions!
  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
  // recognition.
  if (VT.is512BitVector())
    return SDValue();

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = BV->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->isUndef())
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  SDLoc DL(BV);
  SDValue InVec0, InVec1;
  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }
  if (!Subtarget.hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
        ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget.hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binop followed by
      // a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }
  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget.hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
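/// For example, (build_vector (and a, 1), (and b, 2)) is rebuilt as
/// (and (build_vector a, b), (build_vector 1, 2)).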
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
                                       SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op->getSimpleValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // Check that all elements have the same opcode.
  // TODO: Should we allow UNDEFS and if so how many?
  unsigned Opcode = Op->getOperand(0).getOpcode();
  for (unsigned i = 1; i < NumElems; ++i)
    if (Opcode != Op->getOperand(i).getOpcode())
      return SDValue();

  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
  switch (Opcode) {
  default:
    return SDValue();
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (!TLI.isOperationLegalOrPromote(Opcode, VT))
      return SDValue();
    break;
  }

  SmallVector<SDValue, 4> LHSElts, RHSElts;
  for (SDValue Elt : Op->ops()) {
    SDValue LHS = Elt.getOperand(0);
    SDValue RHS = Elt.getOperand(1);

    // We expect the canonicalized RHS operand to be the constant.
    if (!isa<ConstantSDNode>(RHS))
      return SDValue();
    LHSElts.push_back(LHS);
    RHSElts.push_back(RHS);
  }

  SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
  SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
/// functionality to do this, so it's all zeros, all ones, or some derivation
/// that is cheap to calculate.
static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();

  // Vectors containing all zeros can be matched by pxor and xorps.
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, DL);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
        (VT == MVT::v8i32 && Subtarget.hasInt256()))
      return Op;

    return getOnesVector(VT, DAG, DL);
  }

  return SDValue();
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG);

  if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
    return VectorConstant;

  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
  if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
    return AddSub;
  if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
    return HorizontalOp;
  if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
    return Broadcast;
  if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
    return BitOp;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  uint64_t NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.isUndef())
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
      NonZeros |= ((uint64_t)1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF. All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);
  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        MVT VecVT = MVT::v4i32;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
                                      Item, Idx * 2, true, Subtarget, DAG));
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.getSizeInBits() >= 256) {
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
          if (Subtarget.hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }
    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }
  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // See if we can use a vector load to get all of the elements.
  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
      return LD;
  }

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));

    // Recreate the wider vector with the lower and upper part.
    if (VT.is256BitVector())
      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }
  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
      default: break;
      case 0: Ops[i] = Ops[i*2]; break; // Must be a zero vector.
      case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break;
      case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break;
      case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }
  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector from mostly shuffle plus few inserting.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (Subtarget.hasSSE41()) {
      SDValue Result;
      if (!Op.getOperand(0).isUndef())
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).isUndef()) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
      return Result;
    }
    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in
    // the bottom slot of the vector (which generates no code for SSE).
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < NumElems; ++i) {
      if (!Op.getOperand(i).isUndef())
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        Ops[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If Ops[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped. This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (Ops[i+EltStride].isUndef() && EltStride == NumElems/2)
          continue;

        Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return Ops[0];
  }
  return SDValue();
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return concat256BitVectors(
        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
        NumElems, DAG, dl);
  }
  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOfOperands = Op.getNumOperands();

  assert(isPowerOf2_32(NumOfOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  SDValue Undef = DAG.getUNDEF(ResVT);
  if (NumOfOperands > 2) {
    // Specialize the cases when all, or all but one, of the operands are undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return Undef;
    if (NumOfDefinedOps == 1) {
      unsigned SubVecNumElts =
          Op.getOperand(OpIdx).getValueType().getVectorNumElements();
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
                         Op.getOperand(OpIdx), IdxVal);
    }

    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SmallVector<SDValue, 2> Ops;
    for (unsigned i = 0; i < NumOfOperands/2; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  assert(V1.getValueType() == V2.getValueType() &&
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
         "Unexpected operands in CONCAT_VECTORS");

  if (ResVT.getSizeInBits() >= 16)
    return Op; // The operation is legal with KUNPCK

  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                  Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// operations.
//===----------------------------------------------------------------------===//
/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
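// Illustrative example (not from the original source): for a 4-element
// single-input mask, <-1, 1, 2, -1> is a no-op since every defined element
// stays in place, while <1, 0, 2, 3> is not, because elements 0 and 1 swap.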
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
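// Illustrative example (not from the original source): for v8f32 the lane
// size is 4 elements, so mask <4, 5, 6, 7, 0, 1, 2, 3> crosses lanes (every
// element moves to the other 128-bit half), whereas <1, 0, 3, 2, 5, 4, 7, 6>
// stays in-lane.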
/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
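// Illustrative example (not from the original source): for v8i32 with
// 128-bit lanes (4 elements per lane), the two-input mask
// <0, 8, 1, 9, 4, 12, 5, 13> repeats per-lane and yields
// RepeatedMask = <0, 4, 1, 5>, with the second-vector entries remapped into
// [LaneSize, 2*LaneSize).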
/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}
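// Illustrative usage (not from the original source): lowering code can test
// for an unpcklps-shaped mask on v4f32 with
//   isShuffleEquivalent(V1, V2, Mask, {0, 4, 1, 5})
// and undef (-1) mask elements are accepted in any position.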
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
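// Worked example (not from the original source): for Mask = <2, 3, 0, 1>
// each index is packed into 2 bits, lowest element first:
//   Imm = 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E,
// which is the familiar PSHUFD immediate that swaps the two 64-bit halves.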
/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}
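// Illustrative example (not from the original source): shuffling
//   V1 = build_vector(a, b, c, d), V2 = build_vector(0, 0, 0, 0)
// with Mask = <0, 5, -1, 7> yields Zeroable = 0b1110: element 0 is the only
// one that must be materialized, the rest are undef or known zero.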
// The shuffle result has the following form:
//   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
// ascending order and '0*' denotes a possibly empty run of zeros. Each
// element of Zeroable corresponds to one element of Mask, as described in
// computeZeroableShuffleElements.
//
// The function looks for a sub-mask whose nonzero elements are in increasing
// order; if such a sub-mask exists, it returns true.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
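// Illustrative example (not from the original source): with Mask =
// <0, 0, 3, 1> where elements 0 and 2 are zeroable, the non-zeroable entries
// are Mask[1] = 0 and Mask[3] = 1, an increasing run, so this shuffle is a
// VEXPAND candidate with expansion mask ~Zeroable = 0b1010.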
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
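// Illustrative example (not from the original source): for a v8i16 shuffle,
// each 16-bit mask entry M expands to the byte pair {2*M, 2*M+1}, so an
// element mask beginning <4, zz, ...> yields PSHUFB bytes
// {8, 9, 0x80, 0x80, ...}, where 0x80 (sign bit set) zeros the output byte.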
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                          const APInt &Zeroable,
                                          ArrayRef<int> Mask, SDValue &V1,
                                          SDValue &V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getSelect(DL, VT, VMask,
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
                       ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                        unsigned &UnpackOpcode, bool IsUnary,
                                        ArrayRef<int> TargetMask, SDLoc &DL,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
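// Illustrative summary (not from the original source): the sequence above
// computes
//   Result = (V1 & M) | (V2 & ~M)
// where M has all-ones elements wherever the shuffle takes from V1, using
// PAND/PANDN/POR when no native blend instruction is available.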
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
    int M = TargetMask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (M == SM_SentinelZero) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        TargetMask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        TargetMask[i] = i + Size;
        continue;
      }
    }
    return false;
  }
  return true;
}
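// Illustrative example (not from the original source): for a v4i32 target
// mask <0, 5, 2, 7>, elements 1 and 3 come from V2, so matching succeeds
// with BlendMask = 0b1010, the immediate shape consumed by BLENDPS/VPBLENDD.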
uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
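// Worked example (not from the original source): scaling BlendMask = 0b0101
// (Size = 4) by Scale = 2 replicates each bit across the wider elements:
//   scaleVectorShuffleBlendMask(0b0101, 4, 2) == 0b00110011,
// e.g. when retargeting a v4i64 blend at v8i32 granularity for VPBLENDD.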
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getBitcast(BlendVT, V1);
      V2 = DAG.getBitcast(BlendVT, V2);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }

  // Try to lower with the simpler initial blend strategy unless one of the
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
  // shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, blending
  // first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
    if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
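// Illustrative example (not from the original source): a v4i32 mask
// <0, 6, 2, 5> decomposes into V1Mask = <0, u, 2, u>, V2Mask = <u, 2, u, 1>
// and BlendMask = <0, 5, 2, 7>: each input is permuted independently and the
// results are then combined with a single fast blend.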
/// \brief Try to lower a vector shuffle as a rotation.
///
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
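// Worked example (not from the original source): for the two-input v8i16
// mask [11, 12, 13, 14, 15, 0, 1, 2], element 5 takes V1[0], so
// StartIdx = 5 and CandidateRotation = 8 - 5 = 3; element 0 takes V2[3], so
// StartIdx = -3 and CandidateRotation = 3 as well. All candidates agree, so
// the function returns a rotation of 3.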
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
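// Worked example (not from the original source): for v4i32 with
// Mask = <1, 2, 3, zz> (top element zeroable), the Scale = 4, Shift = 1,
// Left = false case matches: Opcode = X86ISD::VSRLDQ with
// ShiftAmt = 1 * 32 / 8 = 4, i.e. a PSRLDQ $4 across the 128-bit lane.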
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return SDValue();

  // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
  // Remainder of lower half result is zero and upper half is all undef.
  auto LowerAsEXTRQ = [&]() {
    // Determine the extraction length from the part of the
    // lower half that isn't zeroable.
    int Len = HalfSize;
    for (; Len > 0; --Len)
      if (!Zeroable[Len - 1])
        break;
    assert(Len > 0 && "Zeroable shuffle mask");

    // Attempt to match first Len sequential elements from the lower half.
    SDValue Src;
    int Idx = -1;
    for (int i = 0; i != Len; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue;
      SDValue &V = (M < Size ? V1 : V2);
      M = M % Size;

      // The extracted elements must start at a valid index and all mask
      // elements must be in the lower half.
      if (i > M || M >= HalfSize)
        return SDValue();

      if (Idx < 0 || (Src == V && Idx == (M - i))) {
        Src = V;
        Idx = M - i;
        continue;
      }
      return SDValue();
    }

    if (Idx < 0)
      return SDValue();

    assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
    int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
    int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));
  };

  if (SDValue ExtrQ = LowerAsEXTRQ())
    return ExtrQ;

  // INSERTQ: Extract lowest Len elements from lower half of second source and
  // insert over first source, starting at Idx.
  // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
  auto LowerAsInsertQ = [&]() {
    for (int Idx = 0; Idx != HalfSize; ++Idx) {
      SDValue Base;

      // Attempt to match first source from mask before insertion point.
      if (isUndefInRange(Mask, 0, Idx)) {
        /* EMPTY */
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
        Base = V1;
      } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
        Base = V2;
      } else {
        continue;
      }

      // Extend the extraction length looking to match both the insertion of
      // the second source and the remaining elements of the first.
      for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
        SDValue Insert;
        int Len = Hi - Idx;

        // Match insertion.
        if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
          Insert = V1;
        } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
          Insert = V2;
        } else {
          continue;
        }

        // Match the remaining elements of the lower half.
        if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
          /* EMPTY */
        } else if ((!Base || (Base == V1)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
          Base = V1;
        } else if ((!Base || (Base == V2)) &&
                   isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                              Size + Hi)) {
          Base = V2;
        } else {
          continue;
        }

        // We may not have a base (first source) - this can safely be undefined.
        if (!Base)
          Base = DAG.getUNDEF(VT);

        int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
        int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
        return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
                           DAG.getConstant(BitLen, DL, MVT::i8),
                           DAG.getConstant(BitIdx, DL, MVT::i8));
      }
    }

    return SDValue();
  };

  if (SDValue InsertQ = LowerAsInsertQ())
    return InsertQ;

  return SDValue();
}
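// Illustrative example (not from the original source): extracting two i16
// elements starting at element 1 of a v8i16 encodes as
// BitLen = 2 * 16 = 32 and BitIdx = 1 * 16 = 16, the 6-bit length and index
// fields of the EXTRQ immediate.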
9289 /// \brief Lower a vector shuffle as a zero or any extension.
9291 /// Given a specific number of elements, element bit width, and extension
9292 /// stride, produce either a zero or any extension based on the available
9293 /// features of the subtarget. The extended elements are consecutive and
9294 /// begin and can start from an offsetted element index in the input; to
9295 /// avoid excess shuffling the offset must either being in the bottom lane
9296 /// or at the start of a higher lane. All extended elements must be from
9298 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9299 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9300 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9301 assert(Scale > 1 && "Need a scale to extend.");
9302 int EltBits = VT.getScalarSizeInBits();
9303 int NumElements = VT.getVectorNumElements();
9304 int NumEltsPerLane = 128 / EltBits;
9305 int OffsetLane = Offset / NumEltsPerLane;
9306 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9307 "Only 8, 16, and 32 bit elements can be extended.");
9308 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9309 assert(0 <= Offset && "Extension offset must be positive.");
9310 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9311 "Extension offset must be in the first lane or start an upper lane.");
9313 // Check that an index is in same lane as the base offset.
9314 auto SafeOffset = [&](int Idx) {
9315 return OffsetLane == (Idx / NumEltsPerLane);
9318 // Shift along an input so that the offset base moves to the first element.
9319 auto ShuffleOffset = [&](SDValue V) {
9323 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9324 for (int i = 0; i * Scale < NumElements; ++i) {
9325 int SrcIdx = i + Offset;
9326 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9328 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9331 // Found a valid zext mask! Try various lowering strategies based on the
9332 // input type and available ISA extensions.
9333 if (Subtarget.hasSSE41()) {
9334 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9335 // PUNPCK will catch this in a later shuffle match.
9336 if (Offset && Scale == 2 && VT.is128BitVector())
9338 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9339 NumElements / Scale);
9340 InputV = ShuffleOffset(InputV);
9341 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9342 return DAG.getBitcast(VT, InputV);
9345 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9347 // For any extends we can cheat for larger element sizes and use shuffle
9348 // instructions that can fold with a load and/or copy.
9349 if (AnyExt && EltBits == 32) {
9350 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9352 return DAG.getBitcast(
9353 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9354 DAG.getBitcast(MVT::v4i32, InputV),
9355 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9357 if (AnyExt && EltBits == 16 && Scale > 2) {
9358 int PSHUFDMask[4] = {Offset / 2, -1,
9359 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9360 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9361 DAG.getBitcast(MVT::v4i32, InputV),
9362 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9363 int PSHUFWMask[4] = {1, -1, -1, -1};
9364 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9365 return DAG.getBitcast(
9366 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9367 DAG.getBitcast(MVT::v8i16, InputV),
9368 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9371 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9373 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9374 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9375 assert(VT.is128BitVector() && "Unexpected vector width!");
9377 int LoIdx = Offset * EltBits;
9378 SDValue Lo = DAG.getBitcast(
9379 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9380 DAG.getConstant(EltBits, DL, MVT::i8),
9381 DAG.getConstant(LoIdx, DL, MVT::i8)));
9383 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9384 !SafeOffset(Offset + 1))
9385 return DAG.getBitcast(VT, Lo);
9387 int HiIdx = (Offset + 1) * EltBits;
9388 SDValue Hi = DAG.getBitcast(
9389 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9390 DAG.getConstant(EltBits, DL, MVT::i8),
9391 DAG.getConstant(HiIdx, DL, MVT::i8)));
9392 return DAG.getBitcast(VT,
9393 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9396 // If this would require more than 2 unpack instructions to expand, use
9397 // pshufb when available. We can only use more than 2 unpack instructions
9398 // when zero extending i8 elements which also makes it easier to use pshufb.
9399 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9400 assert(NumElements == 16 && "Unexpected byte vector width!");
9401 SDValue PSHUFBMask[16];
9402 for (int i = 0; i < 16; ++i) {
9403 int Idx = Offset + (i / Scale);
9404 PSHUFBMask[i] = DAG.getConstant(
9405 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9407 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9408 return DAG.getBitcast(
9409 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9410 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }
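  // For example, zero-extending the low four i8 elements of a v16i8 to v4i32
  // without SSE4.1 (Scale == 4, Offset == 0) takes two trips through the loop
  // below:
  //   punpcklbw InputV, Zero   ; bytes -> words:  [x0 0 x1 0 x2 0 x3 0 ...]
  //   punpcklwd InputV, Zero   ; words -> dwords: [x0 0 0 0 x1 0 0 0 ...]
  // which, bitcast to v4i32, is exactly the zero-extension of x0..x3.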
  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
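///
/// For example, the v4i32 mask <0, Z, 1, Z> (where Z marks a lane known to be
/// zero) matches this pattern with Scale == 2: it is the zero-extension of
/// V1's low two i32 elements to i64, and can lower to PMOVZXDQ on SSE4.1 or
/// to an unpack with a zero vector otherwise.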
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We are no longer in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      ++Matches;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input; we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };
  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice
  // as many elements.
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }
  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
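  // For example, the v4i32 mask <0, 1, Z, Z> (upper half zeroable) reduces to
  // a single MOVQ: copy V1's low 64 bits and zero the upper 64 bits.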
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // elements.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::v4i32;
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    // This is essentially a special case blend operation, but if we have
    // general purpose blend operations, they are always faster. Bail and let
    // the rest of the lowering handle these as blends.
    if (Subtarget.hasSSE41())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
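    // For example, the v4f32 mask <4, 1, 2, 3> becomes MOVSS(V1, V2): V2's
    // low element replaces V1's low element while V1's other lanes pass
    // through.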
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       VT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
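  // For example, broadcasting bits [31:16] of an i32 scalar into v8i16
  // (Scale == 2, odd BroadcastIdx) shifts the scalar right by 16 first, so
  // the subsequent truncate picks out the intended half.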
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");
  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;
  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
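    // For example, broadcasting element 2 of a loaded v4f32 narrows the load
    // to a 4-byte scalar load from BaseAddr + 8, which the broadcast can then
    // fold.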
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));

    // Make sure the newly-created LOAD is in the same position as Ld in
    // terms of dependency. We create a TokenFactor for Ld and V,
    // and update uses of Ld's output chain to use the TokenFactor.
    if (Ld->hasAnyUseOfValue(1)) {
      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                     SDValue(Ld, 1), SDValue(V.getNode(), 1));
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
                             SDValue(V.getNode(), 1));
    }
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    MVT SrcVT = V.getSimpleValueType();
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
           "Unexpected vector size");

    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128)
    V = extract128BitVector(V, 0, DAG, DL);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always use two SHUFPS instructions, which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
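  // The INSERTPS immediate encodes the source element in bits [7:6], the
  // destination slot in bits [5:4], and a zero mask in bits [3:0]. For
  // example, the v4f32 mask <0, 5, 2, 3> inserts V2[1] into slot 1 of V1,
  // giving an immediate of 0x50.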
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
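///
/// For example, the v4i32 mask <1, 5, 0, 4> alternates between V1 and V2;
/// permuting both inputs with <1, 0, -1, -1> first lets a single PUNPCKLDQ
/// produce [V1[1], V2[1], V1[0], V2[0]], which is exactly the desired result.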
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack of elements into the wrong half followed by a permute.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
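    // The immediate selects one element per lane, e.g. the splat mask <1, 1>
    // yields immediate 0b11 and duplicates the high element into both lanes.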
    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");
  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// blending.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
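    // For example, the v2i64 mask <1, 0> widens to the v4i32 mask
    // <2, 3, 0, 1>, i.e. a single "pshufd $0x4E".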
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
  // If we have a blend of two same-type PACKUS operations and the blend aligns
  // with the low and high halves, we can just merge the PACKUS operations.
  // This is particularly important as it lets us merge shuffles that this
  // routine itself creates.
  auto GetPackNode = [](SDValue V) {
    V = peekThroughBitcasts(V);
    return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
  };
  if (SDValue V1Pack = GetPackNode(V1))
    if (SDValue V2Pack = GetPackNode(V2)) {
      EVT PackVT = V1Pack.getValueType();
      if (PackVT == V2Pack.getValueType())
        return DAG.getBitcast(MVT::v2i64,
                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
                                          Mask[0] == 0 ? V1Pack.getOperand(0)
                                                       : V1Pack.getOperand(1),
                                          Mask[1] == 2 ? V2Pack.getOperand(0)
                                                       : V2Pack.getOperand(1)));
    }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It is more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
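///
/// For example, <0, 1, 4, 5> qualifies (the low half reads only V1 and the
/// high half only V2), while <0, 4, 1, 5> does not (each half mixes inputs).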
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
/// \brief Lower a vector shuffle using the SHUFPS instruction.
///
/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
/// It makes no assumptions about whether this is the *best* lowering; it simply
/// uses it.
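///
/// For example, the v4f32 mask <0, 1, 2, 7> lowers to two SHUFPS:
///   t = shufps V2, V1, <3, 0, 2, 0>  ; t = [V2[3], V2[0], V1[2], V1[0]]
///   r = shufps V1, t,  <0, 1, 2, 0>  ; r = [V1[0], V1[1], V1[2], V2[3]]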
static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2, SelectionDAG &DAG) {
  SDValue LowV = V1, HighV = V2;
  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 1) {
    int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();

    // Compute the index adjacent to V2Index and in the same half by toggling
    // the low bit.
    int V2AdjIndex = V2Index ^ 1;

    if (Mask[V2AdjIndex] < 0) {
      // Handles all the cases where we have a single V2 element and an undef.
      // This will only ever happen in the high lanes because we commute the
      // vector otherwise.
      if (V2Index < 2)
        std::swap(LowV, HighV);
      NewMask[V2Index] -= 4;
    } else {
      // Handle the case where the V2 element ends up adjacent to a V1 element.
      // To make this work, blend them together as the first step.
      int V1Index = V2AdjIndex;
      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now proceed to reconstruct the final blend as we have the necessary
      // high or low half formed.
      if (V2Index < 2) {
        LowV = V2;
        HighV = V1;
      } else {
        HighV = V2;
      }
      NewMask[V1Index] = 2; // We put the V1 element in V2[2].
      NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
    }
  } else if (NumV2Elements == 2) {
    if (Mask[0] < 4 && Mask[1] < 4) {
      // Handle the easy case where we have V1 in the low lanes and V2 in the
      // high lanes.
      NewMask[2] -= 4;
      NewMask[3] -= 4;
    } else if (Mask[2] < 4 && Mask[3] < 4) {
      // We also handle the reversed case because this utility may get called
      // when we detect a SHUFPS pattern but can't easily commute the shuffle to
      // arrange things in the right direction.
      NewMask[0] -= 4;
      NewMask[1] -= 4;
      std::swap(LowV, HighV);
    } else {
      // We have a mixture of V1 and V2 in both low and high lanes. Rather than
      // trying to place elements directly, just blend them and set up the final
      // shuffle to place them.

      // The first two blend mask elements are for V1, the second two are for
      // V2.
      int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
                          Mask[2] < 4 ? Mask[2] : Mask[3],
                          (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
                          (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
                       getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));

      // Now we do a normal shuffle of V1 by giving V1 as both operands to
      // a SHUFPS.
      LowV = HighV = V1;
      NewMask[0] = Mask[0] < 4 ? 0 : 2;
      NewMask[1] = Mask[0] < 4 ? 2 : 0;
      NewMask[2] = Mask[2] < 4 ? 1 : 3;
      NewMask[3] = Mask[2] < 4 ? 3 : 1;
    }
  }
  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
                     getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
}
/// \brief Lower 4-lane 32-bit floating point shuffles.
///
/// Uses instructions exclusively from the floating point unit to minimize
/// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Use even/odd duplicate instructions for masks that match their pattern.
    if (Subtarget.hasSSE3()) {
      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
        return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
        return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
    }

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
                         getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    }

    // Otherwise, use a straight shuffle of a single input vector. We pass the
    // input vector to both operands to simulate this with a SHUFPS.
    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // There are special ways we can lower some single-element blends. However, we
  // have custom ways we can lower more complex single-element blends below that
  // we defer to if both this and BLENDPS fail to match, so restrict this to
  // when the V2 input is targeting element 0 of the mask -- that is the fast
  // case here.
  if (NumV2Elements == 1 && Mask[0] >= 4)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (Subtarget.hasSSE41()) {
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

    // Use INSERTPS if we can complete the shuffle efficiently.
    if (SDValue V =
            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
      return V;

    if (!isSingleSHUFPSMask(Mask))
      if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
              DL, MVT::v4f32, V1, V2, Mask, DAG))
        return BlendPerm;
  }

  // Use low/high mov instructions.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
    return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
    return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
    return V;

  // Otherwise fall back to a SHUFPS lowering strategy.
  return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
}
/// \brief Lower 4-lane i32 vector shuffles.
///
/// We try to handle these with integer-domain shuffles where we can, but for
/// blends we use the floating point domain blend instructions.
static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative. It also allows us to fold memory operands into the
  // shuffle in many cases.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We coerce the shuffle pattern to be compatible with UNPCK instructions
    // but we aren't actually going to use the UNPCK instruction because doing
    // so prevents folding a load into this instruction or making a copy.
    const int UnpackLoMask[] = {0, 0, 1, 1};
    const int UnpackHiMask[] = {2, 2, 3, 3};
    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
      Mask = UnpackLoMask;
    else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
      Mask = UnpackHiMask;

    return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                       getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
  }

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It is more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3())
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
      return Rotate;

  // Assume that a single SHUFPS is faster than an alternative sequence of
  // multiple instructions (even if the CPU has a domain penalty).
  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
  if (!isSingleSHUFPSMask(Mask)) {
    // If we have direct support for blends, we should lower by decomposing
    // into a permute. That will be faster than the domain cross.
    if (IsBlendSupported)
      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
                                                        Mask, DAG);

    // Try to lower by permuting the inputs into an unpack instruction.
    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
            DL, MVT::v4i32, V1, V2, Mask, DAG))
      return Unpack;
  }

  // We implement this with SHUFPS because it can blend from two vectors.
  // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
  // up the inputs, bypassing domain shift penalties that we would incur if we
  // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
  // needed.
  SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
  SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
  SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
  return DAG.getBitcast(MVT::v4i32, ShufPS);
}
/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
/// shuffle lowering, and the most complex part.
///
/// The lowering strategy is to try to form pairs of input lanes which are
/// targeted at the same half of the final vector, and then use a dword shuffle
/// to place them onto the right half, and finally unpack the paired lanes into
/// their final position.
///
/// The exact breakdown of how to form these dword pairs and align them on the
/// correct sides is really tricky. See the comments within the function for
/// more of the details.
///
/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
/// vector, form the analogous 128-bit 8-element Mask.
static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

  SmallVector<int, 4> LoInputs;
  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
  std::sort(LoInputs.begin(), LoInputs.end());
  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
  SmallVector<int, 4> HiInputs;
  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
  std::sort(HiInputs.begin(), HiInputs.end());
  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
  int NumLToL =
      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
  int NumHToL = LoInputs.size() - NumLToL;
  int NumLToH =
      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
  int NumHToH = HiInputs.size() - NumLToH;
  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
  // If we are splatting two values from one half - one to each half - then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
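  // For example, the mask <0, 0, 0, 0, 2, 2, 2, 2> splats word 0 across the
  // low half and word 2 across the high half: PSHUFLW <0, 0, 2, 2> packs the
  // two values into the low two dwords, and PSHUFD <0, 0, 1, 1> then splats
  // them to their halves.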
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };

  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any redundant sequence of PSHUFD
  // instructions into a single instruction. Here is an example of the tricky
  // case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] ------------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of the dword with only one word among the three inputs
    // in a half by taking the sum of the half with three inputs and
    // subtracting the sum of the actual three inputs. The difference is the
    // remaining slot.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum -
        std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;
    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
                          VT, V,
                          getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }

    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M / 2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M / 2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
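
  // For example (illustrative): with InPlaceInputs == {1, 3} and
  // HalfOffset == 0 above, AdjIndex == 1 ^ 1 == 0, so word 3 is shuffled into
  // slot 0 next to word 1 and the pair then travels as dword 0 in the final
  // PSHUFD.
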
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;

    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }

    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = InputsFixed[0] + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }
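
  // For example (illustrative): when lowering a v8i16 shuffle here (Size == 8,
  // Scale == 2), a mask entry Mask[0] == 9 selects word 1 of V2, so the loop
  // above sets V2Mask bytes 0-1 to 2 and 3 and V1Mask bytes 0-1 to 0x80 (the
  // PSHUFB "zero this lane" control); the OR below then performs the blend.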

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
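///
/// For example (illustrative), the two-input mask <0, 8, 1, 9, 2, 10, 3, 11>
/// is exactly an interleaving of the low halves of V1 and V2 and maps
/// directly onto a single PUNPCKLWD of the two inputs.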
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}

/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;

  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
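        // For example (illustrative): with a 16-element single-input mask,
        // M == 16 and ModMask == 15; for N == 1 and i == 9 the required value
        // is (9 << 1) & 15 == 2, matching the repeating N = 1 pattern above.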
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}

/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;
      return true;
    };
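
    // For example (illustrative): a mask beginning <0, 0, 3, 3, u, 5, ...>
    // can be widened because every defined byte pair agrees, whereas
    // <0, 1, ...> cannot since its first pair selects two different bytes.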

    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
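    // For example (illustrative): with NumEvenDrops == 1 the 0xFF constant in
    // each i16 lane bitcasts to the byte pattern <FF,00,FF,00,...>, clearing
    // the odd bytes so the PACKUS below can compact the surviving even bytes.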
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
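///
/// For example (illustrative), a v8f32 shuffle is decomposed into two v4f32
/// half shuffles (blends of the four extracted 128-bit halves) whose results
/// are rejoined with ISD::CONCAT_VECTORS.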
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
        DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}

/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}

/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);
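
  // For example (illustrative): for a v4f64 shuffle with Mask == <0, 3, 2, 1>,
  // elements 1 and 3 cross lanes, so FlippedBlendMask becomes <0, 5, 2, 7>
  // where indices 5 and 7 select from the lane-flipped copy built below.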

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}

/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                   VT.getVectorNumElements() / 2);
      SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                DAG.getIntPtrConstant(0, DL));
      SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                OnlyUsesV1 ? V1 : V2,
                                DAG.getIntPtrConstant(0, DL));
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
  int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];

  unsigned PermMask = MaskLO | (MaskHI << 4);
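  // For example (illustrative): WidenedMask == <1, 2> (high 128-bit half of
  // V1 into the low half, low half of V2 into the high half) encodes as
  // PermMask == 1 | (2 << 4) == 0x21.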

  // If either input is a zero vector, replace it with an undef input.
  // Widened mask values < 2 are selecting halves of V1.
  // Widened mask values >= 2 are selecting halves of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}

/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2 * Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2 * Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}

/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
                                               SDValue V1, SDValue V2,
                                               ArrayRef<int> Mask,
                                               const X86Subtarget &Subtarget,
                                               SelectionDAG &DAG) {
  assert(VT.is256BitVector() && "Expected 256-bit vector");

  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

  bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
  bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
  if (!UndefLower && !UndefUpper)
    return SDValue();

  // Upper half is undef and lower half is whole upper subvector.
  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
  if (UndefUpper &&
      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(HalfNumElts, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(0, DL));
  }

  // Lower half is undef and upper half is whole lower subvector.
  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
  if (UndefLower &&
      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
    SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
                             DAG.getIntPtrConstant(0, DL));
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
                       DAG.getIntPtrConstant(HalfNumElts, DL));
  }

  // If the shuffle only uses two of the four halves of the input operands,
  // then extract them and perform the 'half' shuffle at half width.
  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
  int HalfIdx1 = -1, HalfIdx2 = -1;
  SmallVector<int, 8> HalfMask(HalfNumElts);
  unsigned Offset = UndefLower ? HalfNumElts : 0;
  for (unsigned i = 0; i != HalfNumElts; ++i) {
    int M = Mask[i + Offset];
    if (M < 0) {
      HalfMask[i] = M;
      continue;
    }

    // Determine which of the 4 half vectors this element is from.
    // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
    int HalfIdx = M / HalfNumElts;

    // Determine the element index into its half vector source.
    int HalfElt = M % HalfNumElts;

    // We can shuffle with up to 2 half vectors, set the new 'half'
    // shuffle mask accordingly.
    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
      HalfMask[i] = HalfElt;
      HalfIdx1 = HalfIdx;
      continue;
    }
    if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
      HalfMask[i] = HalfElt + HalfNumElts;
      HalfIdx2 = HalfIdx;
      continue;
    }

    // Too many half vectors referenced.
    return SDValue();
  }
  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

  // Only shuffle the halves of the inputs when useful.
  int NumLowerHalves =
      (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
  int NumUpperHalves =
      (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);

  // uuuuXXXX - don't extract uppers just to insert again.
  if (UndefLower && NumUpperHalves != 0)
    return SDValue();

  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
  if (UndefUpper && NumUpperHalves == 2)
    return SDValue();

  // AVX2 - XXXXuuuu - always extract lowers.
  if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();
    // AVX2 supports variable 32-bit element cross-lane shuffles.
    if (VT == MVT::v8f32 || VT == MVT::v8i32) {
      // XXXXuuuu - don't extract lowers and uppers.
      if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
        return SDValue();
    }
  }

  auto GetHalfVector = [&](int HalfIdx) {
    if (HalfIdx < 0)
      return DAG.getUNDEF(HalfVT);
    SDValue V = (HalfIdx < 2 ? V1 : V2);
    HalfIdx = (HalfIdx % 2) * HalfNumElts;
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
                       DAG.getIntPtrConstant(HalfIdx, DL));
  };

  SDValue Half1 = GetHalfVector(HalfIdx1);
  SDValue Half2 = GetHalfVector(HalfIdx2);
  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
                     DAG.getIntPtrConstant(Offset, DL));
}

/// \brief Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in the
/// slot required by the given mask and require no permutation.
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}

/// Handle case where shuffle sources are coming from the same 128-bit lane and
/// every lane can be represented as the same repeating mask - allowing us to
/// shuffle the sources with the repeating shuffle and then permute the result
/// to the destination lanes.
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  int NumElts = VT.getVectorNumElements();
  int NumLanes = VT.getSizeInBits() / 128;
  int NumLaneElts = NumElts / NumLanes;

  // On AVX2 we may be able to just shuffle the lowest elements and then
  // broadcast the result.
  if (Subtarget.hasAVX2()) {
    for (unsigned BroadcastSize : {16, 32, 64}) {
      if (BroadcastSize <= VT.getScalarSizeInBits())
        continue;
      int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

      // Attempt to match a repeating pattern every NumBroadcastElts,
      // accounting for UNDEFs but only referencing the lowest 128-bit
      // lane of the inputs.
      auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
        for (int i = 0; i != NumElts; i += NumBroadcastElts)
          for (int j = 0; j != NumBroadcastElts; ++j) {
            int M = Mask[i + j];
            if (M < 0)
              continue;
            int &R = RepeatMask[j];
            if (0 != ((M % NumElts) / NumLaneElts))
              return false;
            if (0 <= R && R != M)
              return false;
            R = M;
          }
        return true;
      };

      SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
      if (!FindRepeatingBroadcastMask(RepeatMask))
        continue;

      // Shuffle the (lowest) repeated elements in place for broadcast.
      SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

      // Shuffle the actual broadcast.
      SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
      for (int i = 0; i != NumElts; i += NumBroadcastElts)
        for (int j = 0; j != NumBroadcastElts; ++j)
          BroadcastMask[i + j] = j;
      return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
                                  BroadcastMask);
    }
  }

  // Bail if the shuffle mask doesn't cross 128-bit lanes.
  if (!is128BitLaneCrossingShuffleMask(VT, Mask))
    return SDValue();

  // Bail if we already have a repeated lane shuffle mask.
  SmallVector<int, 8> RepeatedShuffleMask;
  if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
    return SDValue();

  // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
  // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
  int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
  int NumSubLanes = NumLanes * SubLaneScale;
  int NumSubLaneElts = NumLaneElts / SubLaneScale;
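
  // For example (illustrative): for v8f32 on AVX2, SubLaneScale == 2 yields
  // four 64-bit sub-lanes of two elements each, which is the granularity at
  // which VPERMPD/VPERMQ can later place them.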
12298 // Check that all the sources are coming from the same lane and see if we can
12299 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12300 // determine the source sub-lane for each destination sub-lane.
12301 int TopSrcSubLane = -1;
12302 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12303 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12304 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12305 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12307 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12308 // Extract the sub-lane mask, check that it all comes from the same lane
12309 // and normalize the mask entries to come from the first lane.
12311 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12312 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12313 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12316 int Lane = (M % NumElts) / NumLaneElts;
12317 if ((0 <= SrcLane) && (SrcLane != Lane))
12320 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12321 SubLaneMask[Elt] = LocalM;
12324 // Whole sub-lane is UNDEF.
12328 // Attempt to match against the candidate repeated sub-lane masks.
12329 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12330 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12331 for (int i = 0; i != NumSubLaneElts; ++i) {
12332 if (M1[i] < 0 || M2[i] < 0)
12334 if (M1[i] != M2[i])
12340 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12341 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12344 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12345 for (int i = 0; i != NumSubLaneElts; ++i) {
12346 int M = SubLaneMask[i];
12349 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12350 "Unexpected mask element");
12351 RepeatedSubLaneMask[i] = M;
12354 // Track the topmost source sub-lane - by setting the remaining to UNDEF
12355 // we can greatly simplify shuffle matching.
12356 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12357 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12358 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12362 // Bail if we failed to find a matching repeated sub-lane mask.
12363 if (Dst2SrcSubLanes[DstSubLane] < 0)
12366 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12367 "Unexpected source lane");
12369 // Create a repeating shuffle mask for the entire vector.
12370 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12371 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12372 int Lane = SubLane / SubLaneScale;
12373 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12374 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12375 int M = RepeatedSubLaneMask[Elt];
12378 int Idx = (SubLane * NumSubLaneElts) + Elt;
12379 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12382 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12384 // Shuffle each source sub-lane to its destination.
12385 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12386 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12387 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12388 if (SrcSubLane < 0)
12390 for (int j = 0; j != NumSubLaneElts; ++j)
12391 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12394 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12398 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12399 unsigned &ShuffleImm,
12400 ArrayRef<int> Mask) {
12401 int NumElts = VT.getVectorNumElements();
12402 assert(VT.getScalarSizeInBits() == 64 &&
12403 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12404 "Unexpected data type for VSHUFPD");
12406 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
12407 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
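// Worked example (illustrative, not from the original source): for
// MVT::v4f64 and Mask = {0, 5, 2, 7}, every element i stays within its
// allowed pair (Val..Val+1), so ShufpdMask remains true, and the immediate
// collects the low bit of each entry:
// (0 % 2) | ((5 % 2) << 1) | ((2 % 2) << 2) | ((7 % 2) << 3) == 0xA.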
12409 bool ShufpdMask = true;
12410 bool CommutableMask = true;
12411 for (int i = 0; i < NumElts; ++i) {
12412 if (Mask[i] == SM_SentinelUndef)
12416 int Val = (i & 6) + NumElts * (i & 1);
12417 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12418 if (Mask[i] < Val || Mask[i] > Val + 1)
12419 ShufpdMask = false;
12420 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12421 CommutableMask = false;
12422 ShuffleImm |= (Mask[i] % 2) << i;
12427 if (CommutableMask) {
12435 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12436 ArrayRef<int> Mask, SDValue V1,
12437 SDValue V2, SelectionDAG &DAG) {
12438 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12439 "Unexpected data type for VSHUFPD");
12441 unsigned Immediate = 0;
12442 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12445 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12446 DAG.getConstant(Immediate, DL, MVT::i8));
12449 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12450 ArrayRef<int> Mask, SDValue V1,
12451 SDValue V2, SelectionDAG &DAG) {
12452 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12453 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12455 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12457 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12459 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12462 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12464 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12465 /// isn't available.
12466 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12467 const APInt &Zeroable,
12468 SDValue V1, SDValue V2,
12469 const X86Subtarget &Subtarget,
12470 SelectionDAG &DAG) {
12471 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12472 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12473 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12475 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12476 Zeroable, Subtarget, DAG))
12479 if (V2.isUndef()) {
12480 // Check for being able to broadcast a single element.
12481 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12482 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12485 // Use low duplicate instructions for masks that match their pattern.
12486 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12487 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12489 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12490 // Non-half-crossing single input shuffles can be lowered with an
12491 // interleaved permutation.
12492 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12493 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12494 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12495 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
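// For illustration (added commentary): Mask = {1, 0, 3, 2}, a swap within
// each 128-bit lane, sets bits 0 and 2 of the immediate (Mask[0] == 1 and
// Mask[2] == 3 hold), so this emits VPERMILPD with immediate 0b0101.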
12498 // With AVX2 we have direct support for this permutation.
12499 if (Subtarget.hasAVX2())
12500 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12501 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12503 // Try to create an in-lane repeating shuffle mask and then shuffle the
12504 // results into the target lanes.
12505 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12506 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12509 // Otherwise, fall back.
12510 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12514 // Use dedicated unpack instructions for masks that match their pattern.
12516 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12519 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12520 Zeroable, Subtarget, DAG))
12523 // Check if the blend happens to exactly fit that of SHUFPD.
12525 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12528 // Try to create an in-lane repeating shuffle mask and then shuffle the
12529 // results into the target lanes.
12530 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12531 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12534 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12535 // shuffle. However, if we have AVX2 and either input is already in place,
12536 // we will be able to shuffle the other input even across lanes in a single
12537 // instruction, so skip this pattern.
12538 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12539 isShuffleMaskInputInPlace(1, Mask))))
12540 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12541 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12543 // If we have VLX support, we can use VEXPAND.
12544 if (Subtarget.hasVLX())
12545 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12546 V1, V2, DAG, Subtarget))
12549 // If we have AVX2 then we always want to lower with a blend because at v4 we
12550 // can fully permute the elements.
12551 if (Subtarget.hasAVX2())
12552 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12555 // Otherwise fall back on generic lowering.
12556 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12559 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12561 /// This routine is only called when we have AVX2 and thus a reasonable
12562 /// instruction set for v4i64 shuffling.
12563 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12564 const APInt &Zeroable,
12565 SDValue V1, SDValue V2,
12566 const X86Subtarget &Subtarget,
12567 SelectionDAG &DAG) {
12568 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12569 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12570 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12571 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12573 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12574 Zeroable, Subtarget, DAG))
12577 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12578 Zeroable, Subtarget, DAG))
12581 // Check for being able to broadcast a single element.
12582 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12583 Mask, Subtarget, DAG))
12586 if (V2.isUndef()) {
12587 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12588 // can use lower latency instructions that will operate on both lanes.
12589 SmallVector<int, 2> RepeatedMask;
12590 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12591 SmallVector<int, 4> PSHUFDMask;
12592 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12593 return DAG.getBitcast(
12595 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12596 DAG.getBitcast(MVT::v8i32, V1),
12597 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12600 // AVX2 provides a direct instruction for permuting a single input across
12601 // lanes.
12602 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12603 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12606 // Try to use shift instructions.
12607 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12608 Zeroable, Subtarget, DAG))
12611 // If we have VLX support, we can use VALIGN or VEXPAND.
12612 if (Subtarget.hasVLX()) {
12613 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12614 Mask, Subtarget, DAG))
12617 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12618 V1, V2, DAG, Subtarget))
12622 // Try to use PALIGNR.
12623 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12624 Mask, Subtarget, DAG))
12627 // Use dedicated unpack instructions for masks that match their pattern.
12629 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12632 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12633 // shuffle. However, if we have AVX2 and either input is already in place,
12634 // we will be able to shuffle the other input even across lanes in a single
12635 // instruction, so skip this pattern.
12636 if (!isShuffleMaskInputInPlace(0, Mask) &&
12637 !isShuffleMaskInputInPlace(1, Mask))
12638 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12639 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12642 // Otherwise fall back on generic blend lowering.
12643 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12647 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12649 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12650 /// isn't available.
12651 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12652 const APInt &Zeroable,
12653 SDValue V1, SDValue V2,
12654 const X86Subtarget &Subtarget,
12655 SelectionDAG &DAG) {
12656 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12657 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12658 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12660 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12661 Zeroable, Subtarget, DAG))
12664 // Check for being able to broadcast a single element.
12665 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12666 Mask, Subtarget, DAG))
12669 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12670 // options to efficiently lower the shuffle.
12671 SmallVector<int, 4> RepeatedMask;
12672 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12673 assert(RepeatedMask.size() == 4 &&
12674 "Repeated masks must be half the mask width!");
12676 // Use even/odd duplicate instructions for masks that match their pattern.
12677 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12678 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12679 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12680 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12683 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12684 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12686 // Use dedicated unpack instructions for masks that match their pattern.
12688 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12691 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12692 // have already handled any direct blends.
12693 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12696 // Try to create an in-lane repeating shuffle mask and then shuffle the
12697 // results into the target lanes.
12698 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12699 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12702 // If we have a single input shuffle with different shuffle patterns in the
12703 // two 128-bit lanes use the variable mask to VPERMILPS.
12704 if (V2.isUndef()) {
12705 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12706 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12707 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12709 if (Subtarget.hasAVX2())
12710 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12712 // Otherwise, fall back.
12713 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12717 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12718 // shuffle.
12719 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12720 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12722 // If we have VLX support, we can use VEXPAND.
12723 if (Subtarget.hasVLX())
12724 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12725 V1, V2, DAG, Subtarget))
12728 // For non-AVX512, if the mask is an in-lane pattern of 16-bit elements, try
12729 // to split, since the split form uses the more efficient vpunpcklwd and
12730 // vpunpckhwd instructions rather than vblend.
12731 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12732 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12736 // If we have AVX2 then we always want to lower with a blend because at v8 we
12737 // can fully permute the elements.
12738 if (Subtarget.hasAVX2())
12739 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12742 // Otherwise fall back on generic lowering.
12743 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12746 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12748 /// This routine is only called when we have AVX2 and thus a reasonable
12749 /// instruction set for v8i32 shuffling.
12750 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12751 const APInt &Zeroable,
12752 SDValue V1, SDValue V2,
12753 const X86Subtarget &Subtarget,
12754 SelectionDAG &DAG) {
12755 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12756 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12757 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12758 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12760 // Whenever we can lower this as a zext, that instruction is strictly faster
12761 // than any alternative. It also allows us to fold memory operands into the
12762 // shuffle in many cases.
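// Illustrative case (added commentary): a v8i32 shuffle whose mask is
// {0, Z, 1, Z, 2, Z, 3, Z}, with Z denoting a known-zero element, is
// equivalent to zero-extending the low four i32 elements to i64 and can be
// emitted as a single VPMOVZXDQ.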
12763 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12764 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12767 // For non-AVX512, if the mask is an in-lane pattern of 16-bit elements, try
12768 // to split, since the split form uses the more efficient vpunpcklwd and
12769 // vpunpckhwd instructions rather than vblend.
12770 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12771 !Subtarget.hasAVX512())
12773 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12776 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12777 Zeroable, Subtarget, DAG))
12780 // Check for being able to broadcast a single element.
12781 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12782 Mask, Subtarget, DAG))
12785 // If the shuffle mask is repeated in each 128-bit lane we can use more
12786 // efficient instructions that mirror the shuffles across the two 128-bit
12787 // lanes.
12788 SmallVector<int, 4> RepeatedMask;
12789 bool Is128BitLaneRepeatedShuffle =
12790 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12791 if (Is128BitLaneRepeatedShuffle) {
12792 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12794 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12795 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12797 // Use dedicated unpack instructions for masks that match their pattern.
12799 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12803 // Try to use shift instructions.
12804 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12805 Zeroable, Subtarget, DAG))
12808 // If we have VLX support, we can use VALIGN or EXPAND.
12809 if (Subtarget.hasVLX()) {
12810 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12811 Mask, Subtarget, DAG))
12814 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12815 V1, V2, DAG, Subtarget))
12819 // Try to use byte rotation instructions.
12820 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12821 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12824 // Try to create an in-lane repeating shuffle mask and then shuffle the
12825 // results into the target lanes.
12826 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12827 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12830 // If the shuffle patterns aren't repeated but it is a single input, directly
12831 // generate a cross-lane VPERMD instruction.
12832 if (V2.isUndef()) {
12833 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12834 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12837 // Assume that a single SHUFPS is faster than an alternative sequence of
12838 // multiple instructions (even if the CPU has a domain penalty).
12839 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12840 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12841 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12842 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12843 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12844 CastV1, CastV2, DAG);
12845 return DAG.getBitcast(MVT::v8i32, ShufPS);
12848 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12849 // shuffle.
12850 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12851 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12854 // Otherwise fall back on generic blend lowering.
12855 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12859 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12861 /// This routine is only called when we have AVX2 and thus a reasonable
12862 /// instruction set for v16i16 shuffling.
12863 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12864 const APInt &Zeroable,
12865 SDValue V1, SDValue V2,
12866 const X86Subtarget &Subtarget,
12867 SelectionDAG &DAG) {
12868 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12869 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12870 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12871 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12873 // Whenever we can lower this as a zext, that instruction is strictly faster
12874 // than any alternative. It also allows us to fold memory operands into the
12875 // shuffle in many cases.
12876 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12877 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12880 // Check for being able to broadcast a single element.
12881 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12882 Mask, Subtarget, DAG))
12885 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12886 Zeroable, Subtarget, DAG))
12889 // Use dedicated unpack instructions for masks that match their pattern.
12891 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12894 // Try to use shift instructions.
12895 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12896 Zeroable, Subtarget, DAG))
12899 // Try to use byte rotation instructions.
12900 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12901 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12904 // Try to create an in-lane repeating shuffle mask and then shuffle the
12905 // results into the target lanes.
12906 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12907 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12910 if (V2.isUndef()) {
12911 // There are no generalized cross-lane shuffle operations available on i16
12912 // element types.
12913 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12914 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12917 SmallVector<int, 8> RepeatedMask;
12918 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12919 // As this is a single-input shuffle, the repeated mask should be
12920 // a strictly valid v8i16 mask that we can pass through to the v8i16
12921 // lowering to handle even the v16 case.
12922 return lowerV8I16GeneralSingleInputVectorShuffle(
12923 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12927 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12928 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12931 // AVX512BWVL can lower to VPERMW.
12932 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12933 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12935 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12936 // shuffle.
12937 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12938 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12941 // Otherwise fall back on generic lowering.
12942 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12945 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12947 /// This routine is only called when we have AVX2 and thus a reasonable
12948 /// instruction set for v32i8 shuffling.
12949 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12950 const APInt &Zeroable,
12951 SDValue V1, SDValue V2,
12952 const X86Subtarget &Subtarget,
12953 SelectionDAG &DAG) {
12954 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12955 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12956 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12957 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12959 // Whenever we can lower this as a zext, that instruction is strictly faster
12960 // than any alternative. It also allows us to fold memory operands into the
12961 // shuffle in many cases.
12962 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12963 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12966 // Check for being able to broadcast a single element.
12967 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12968 Mask, Subtarget, DAG))
12971 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12972 Zeroable, Subtarget, DAG))
12975 // Use dedicated unpack instructions for masks that match their pattern.
12977 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12980 // Try to use shift instructions.
12981 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12982 Zeroable, Subtarget, DAG))
12985 // Try to use byte rotation instructions.
12986 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12987 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12990 // Try to create an in-lane repeating shuffle mask and then shuffle the
12991 // results into the target lanes.
12992 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12993 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12996 // There are no generalized cross-lane shuffle operations available on i8
12997 // element types.
12998 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12999 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13002 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13003 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13006 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13007 // shuffle.
13008 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13009 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13012 // Otherwise fall back on generic lowering.
13013 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13016 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13018 /// This routine either breaks down the specific type of a 256-bit x86 vector
13019 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13020 /// together based on the available instructions.
13021 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13022 MVT VT, SDValue V1, SDValue V2,
13023 const APInt &Zeroable,
13024 const X86Subtarget &Subtarget,
13025 SelectionDAG &DAG) {
13026 // If we have a single input to the zero element, insert that into V1 if we
13027 // can do so cheaply.
13028 int NumElts = VT.getVectorNumElements();
13029 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13031 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13032 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13033 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13036 // Handle special cases where the lower or upper half is UNDEF.
13038 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13041 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13042 // can check for those subtargets here and avoid much of the subtarget
13043 // querying in the per-vector-type lowering routines. With AVX1 we have
13044 // essentially *zero* ability to manipulate a 256-bit vector with integer
13045 // types. Since we'll use floating point types there eventually, just
13046 // immediately cast everything to a float and operate entirely in that domain.
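// For example (illustrative): a v8i32 shuffle on plain AVX is bitcast to
// v8f32, lowered in the floating point domain, and bitcast back, while a
// v16i16 shuffle (ElementBits < 32) has no float type of matching width
// and is handled by bit masking/blending or a 128-bit split instead.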
13047 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13048 int ElementBits = VT.getScalarSizeInBits();
13049 if (ElementBits < 32) {
13050 // No floating point type available, if we can't use the bit operations
13051 // for masking/blending then decompose into 128-bit vectors.
13053 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13055 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13057 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13060 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13061 VT.getVectorNumElements());
13062 V1 = DAG.getBitcast(FpVT, V1);
13063 V2 = DAG.getBitcast(FpVT, V2);
13064 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13067 switch (VT.SimpleTy) {
13069 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13071 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13073 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13075 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13077 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13079 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13082 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13086 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13087 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13088 ArrayRef<int> Mask, SDValue V1,
13089 SDValue V2, SelectionDAG &DAG) {
13090 assert(VT.getScalarSizeInBits() == 64 &&
13091 "Unexpected element type size for 128bit shuffle.");
13093 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
13094 // is most probably the better solution for that case.
13095 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13097 SmallVector<int, 4> WidenedMask;
13098 if (!canWidenShuffleElements(Mask, WidenedMask))
13101 // Check for patterns which can be matched with a single insert of a 256-bit
13102 // subvector.
13103 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13104 {0, 1, 2, 3, 0, 1, 2, 3});
13105 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13106 {0, 1, 2, 3, 8, 9, 10, 11})) {
13107 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13108 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13109 DAG.getIntPtrConstant(0, DL));
13110 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13111 OnlyUsesV1 ? V1 : V2,
13112 DAG.getIntPtrConstant(0, DL));
13113 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13116 assert(WidenedMask.size() == 4);
13118 // See if this is an insertion of the lower 128-bits of V2 into V1.
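// Illustrative case (not from the original source): for v8i64 with
// WidenedMask = {0, 1, 4, 3}, sub-vectors 0, 1 and 3 of V1 are already in
// place and widened index 4 names the lowest 128 bits of V2, so the whole
// shuffle reduces to inserting V2's low 128 bits at element offset 4 of V1.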
13119 bool IsInsert = true;
13121 for (int i = 0; i < 4; ++i) {
13122 assert(WidenedMask[i] >= -1);
13123 if (WidenedMask[i] < 0)
13126 // Make sure all V1 subvectors are in place.
13127 if (WidenedMask[i] < 4) {
13128 if (WidenedMask[i] != i) {
13133 // Make sure we only have a single V2 index and it's the lowest 128-bits.
13134 if (V2Index >= 0 || WidenedMask[i] != 4) {
13141 if (IsInsert && V2Index >= 0) {
13142 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13143 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13144 DAG.getIntPtrConstant(0, DL));
13145 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13148 // Try to lower to vshuf64x2/vshuf32x4.
13149 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13150 unsigned PermMask = 0;
13151 // Ensure elements came from the same Op.
13152 for (int i = 0; i < 4; ++i) {
13153 assert(WidenedMask[i] >= -1);
13154 if (WidenedMask[i] < 0)
13157 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13158 unsigned OpIndex = i / 2;
13159 if (Ops[OpIndex].isUndef())
13161 else if (Ops[OpIndex] != Op)
13164 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13165 // bits defined by a vshuf64x2 instruction's immediate control byte.
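// Worked example (added for exposition): WidenedMask = {0, 2, 4, 6}
// selects the 128-bit chunks V1[0], V1[2], V2[0], V2[2]; Ops becomes
// {V1, V2} and PermMask == (0 << 0) | (2 << 2) | (0 << 4) | (2 << 6),
// i.e. 0x88.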
13166 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13169 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13170 DAG.getConstant(PermMask, DL, MVT::i8));
13173 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13174 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13175 const APInt &Zeroable,
13176 SDValue V1, SDValue V2,
13177 const X86Subtarget &Subtarget,
13178 SelectionDAG &DAG) {
13179 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13180 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13181 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13183 if (V2.isUndef()) {
13184 // Use low duplicate instructions for masks that match their pattern.
13185 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13186 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13188 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13189 // Non-half-crossing single input shuffles can be lowered with an
13190 // interleaved permutation.
13191 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13192 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13193 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13194 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13195 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13196 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13199 SmallVector<int, 4> RepeatedMask;
13200 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13201 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13202 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13205 if (SDValue Shuf128 =
13206 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13209 if (SDValue Unpck =
13210 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13213 // Check if the blend happens to exactly fit that of SHUFPD.
13215 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13218 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13219 V2, DAG, Subtarget))
13222 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13223 Zeroable, Subtarget, DAG))
13226 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13229 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13230 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13231 const APInt &Zeroable,
13232 SDValue V1, SDValue V2,
13233 const X86Subtarget &Subtarget,
13234 SelectionDAG &DAG) {
13235 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13236 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13237 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13239 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13240 // options to efficiently lower the shuffle.
13241 SmallVector<int, 4> RepeatedMask;
13242 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13243 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13245 // Use even/odd duplicate instructions for masks that match their pattern.
13246 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13247 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13248 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13249 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13252 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13253 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13255 // Use dedicated unpack instructions for masks that match their pattern.
13256 if (SDValue Unpck =
13257 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13260 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13261 Zeroable, Subtarget, DAG))
13264 // Otherwise, fall back to a SHUFPS sequence.
13265 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13267 // If we have AVX512F support, we can use VEXPAND.
13268 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13269 V1, V2, DAG, Subtarget))
13272 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13275 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13276 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13277 const APInt &Zeroable,
13278 SDValue V1, SDValue V2,
13279 const X86Subtarget &Subtarget,
13280 SelectionDAG &DAG) {
13281 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13282 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13283 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13285 if (SDValue Shuf128 =
13286 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13289 if (V2.isUndef()) {
13290 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13291 // can use lower latency instructions that will operate on all four
13292 // 128-bit lanes.
13293 SmallVector<int, 2> Repeated128Mask;
13294 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13295 SmallVector<int, 4> PSHUFDMask;
13296 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13297 return DAG.getBitcast(
13299 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13300 DAG.getBitcast(MVT::v16i32, V1),
13301 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13304 SmallVector<int, 4> Repeated256Mask;
13305 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13306 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13307 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13310 // Try to use shift instructions.
13311 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13312 Zeroable, Subtarget, DAG))
13315 // Try to use VALIGN.
13316 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13317 Mask, Subtarget, DAG))
13320 // Try to use PALIGNR.
13321 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13322 Mask, Subtarget, DAG))
13325 if (SDValue Unpck =
13326 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13328 // If we have AVX512F support, we can use VEXPAND.
13329 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13330 V2, DAG, Subtarget))
13333 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13334 Zeroable, Subtarget, DAG))
13337 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13340 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13341 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13342 const APInt &Zeroable,
13343 SDValue V1, SDValue V2,
13344 const X86Subtarget &Subtarget,
13345 SelectionDAG &DAG) {
13346 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13347 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13348 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13350 // Whenever we can lower this as a zext, that instruction is strictly faster
13351 // than any alternative. It also allows us to fold memory operands into the
13352 // shuffle in many cases.
13353 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13354 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13357 // If the shuffle mask is repeated in each 128-bit lane we can use more
13358 // efficient instructions that mirror the shuffles across the four 128-bit
13359 // lanes.
13360 SmallVector<int, 4> RepeatedMask;
13361 bool Is128BitLaneRepeatedShuffle =
13362 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13363 if (Is128BitLaneRepeatedShuffle) {
13364 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13366 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13367 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13369 // Use dedicated unpack instructions for masks that match their pattern.
13371 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13375 // Try to use shift instructions.
13376 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13377 Zeroable, Subtarget, DAG))
13380 // Try to use VALIGN.
13381 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13382 Mask, Subtarget, DAG))
13385 // Try to use byte rotation instructions.
13386 if (Subtarget.hasBWI())
13387 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13388 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13391 // Assume that a single SHUFPS is faster than using a permv shuffle.
13392 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13393 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13394 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13395 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13396 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13397 CastV1, CastV2, DAG);
13398 return DAG.getBitcast(MVT::v16i32, ShufPS);
13400 // If we have AVX512F support, we can use VEXPAND.
13401 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13402 V1, V2, DAG, Subtarget))
13405 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13406 Zeroable, Subtarget, DAG))
13408 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13411 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13412 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13413 const APInt &Zeroable,
13414 SDValue V1, SDValue V2,
13415 const X86Subtarget &Subtarget,
13416 SelectionDAG &DAG) {
13417 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13418 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13419 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13420 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13422 // Whenever we can lower this as a zext, that instruction is strictly faster
13423 // than any alternative. It also allows us to fold memory operands into the
13424 // shuffle in many cases.
13425 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13426 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13429 // Use dedicated unpack instructions for masks that match their pattern.
13431 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13434 // Try to use shift instructions.
13435 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13436 Zeroable, Subtarget, DAG))
13439 // Try to use byte rotation instructions.
13440 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13441 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13444 if (V2.isUndef()) {
13445 SmallVector<int, 8> RepeatedMask;
13446 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13447 // As this is a single-input shuffle, the repeated mask should be
13448 // a strictly valid v8i16 mask that we can pass through to the v8i16
13449 // lowering to handle even the v32 case.
13450 return lowerV8I16GeneralSingleInputVectorShuffle(
13451 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13455 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13456 Zeroable, Subtarget, DAG))
13459 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13462 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13463 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13464 const APInt &Zeroable,
13465 SDValue V1, SDValue V2,
13466 const X86Subtarget &Subtarget,
13467 SelectionDAG &DAG) {
13468 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13469 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13470 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13471 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13473 // Whenever we can lower this as a zext, that instruction is strictly faster
13474 // than any alternative. It also allows us to fold memory operands into the
13475 // shuffle in many cases.
13476 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13477 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13480 // Use dedicated unpack instructions for masks that match their pattern.
13482 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13485 // Try to use shift instructions.
13486 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13487 Zeroable, Subtarget, DAG))
13490 // Try to use byte rotation instructions.
13491 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13492 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13495 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13496 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13499 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13500 if (Subtarget.hasVBMI())
13501 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13503 // Try to create an in-lane repeating shuffle mask and then shuffle the
13504 // the results into the target lanes.
13505 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13506 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13509 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13510 Zeroable, Subtarget, DAG))
13513 // FIXME: Implement direct support for this type!
13514 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13517 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13519 /// This routine either breaks down the specific type of a 512-bit x86 vector
13520 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13521 /// together based on the available instructions.
13522 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13523 MVT VT, SDValue V1, SDValue V2,
13524 const APInt &Zeroable,
13525 const X86Subtarget &Subtarget,
13526 SelectionDAG &DAG) {
13527 assert(Subtarget.hasAVX512() &&
13528 "Cannot lower 512-bit vectors w/ basic ISA!");
13530 // If we have a single input to the zero element, insert that into V1 if we
13531 // can do so cheaply.
13532 int NumElts = Mask.size();
13533 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13535 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13536 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13537 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13540 // Check for being able to broadcast a single element.
13541 if (SDValue Broadcast =
13542 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13545 // Dispatch to each element type for lowering. If we don't have support for
13546 // specific element type shuffles at 512 bits, immediately split them and
13547 // lower them. Each lowering routine of a given type is allowed to assume that
13548 // the requisite ISA extensions for that element type are available.
13549 switch (VT.SimpleTy) {
13551 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13553 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13555 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13557 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13559 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13561 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13564 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13568 // Lower vXi1 vector shuffles.
13569 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13570 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13571 // vector, shuffle, and then truncate it back.
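// For instance (illustrative): a v16i1 shuffle is sign-extended to v16i32,
// shuffled with the ordinary 512-bit lowering, and converted back to v16i1
// via X86ISD::CVT2MASK or a plain truncate, per the checks below.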
13572 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13573 MVT VT, SDValue V1, SDValue V2,
13574 const X86Subtarget &Subtarget,
13575 SelectionDAG &DAG) {
13576 assert(Subtarget.hasAVX512() &&
13577 "Cannot lower 512-bit vectors w/o basic ISA!");
13579 switch (VT.SimpleTy) {
13581 llvm_unreachable("Expected a vector of i1 elements");
13583 ExtVT = MVT::v2i64;
13586 ExtVT = MVT::v4i32;
13589 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13592 ExtVT = MVT::v16i32;
13595 ExtVT = MVT::v32i16;
13598 ExtVT = MVT::v64i8;
13602 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13603 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13604 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13605 V1 = getOnesVector(ExtVT, DAG, DL);
13607 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13610 V2 = DAG.getUNDEF(ExtVT);
13611 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13612 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13613 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13614 V2 = getOnesVector(ExtVT, DAG, DL);
13616 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13618 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13619 // As i1 was sign-extended, we can use X86ISD::CVT2MASK.
13620 int NumElems = VT.getVectorNumElements();
13621 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13622 (Subtarget.hasDQI() && (NumElems < 32)))
13623 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13625 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13628 /// Helper function that returns true if the shuffle mask should be
13629 /// commuted to improve canonicalization.
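/// For example (illustrative, added commentary): for a v4 shuffle with
/// Mask = {4, 5, 6, 3}, three elements come from V2 and only one from V1,
/// so the mask is commuted - each index is flipped across NumElements -
/// and the shuffle is matched as {0, 1, 2, 7} with the operands swapped.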
13630 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13631 int NumElements = Mask.size();
13633 int NumV1Elements = 0, NumV2Elements = 0;
13637 else if (M < NumElements)
13642 // Commute the shuffle as needed such that more elements come from V1 than
13643 // V2. This allows us to match the shuffle pattern strictly on how many
13644 // elements come from V1 without handling the symmetric cases.
13645 if (NumV2Elements > NumV1Elements)
13648 assert(NumV1Elements > 0 && "No V1 indices");
13650 if (NumV2Elements == 0)
13653 // When the numbers of V1 and V2 elements are the same, try to minimize the
13654 // number of uses of V2 in the low half of the vector. When that is tied,
13655 // ensure that the sum of indices for V1 is equal to or lower than the sum
13656 // of indices for V2. When those are equal, try to ensure that the number of odd
13657 // indices for V1 is lower than the number of odd indices for V2.
13658 if (NumV1Elements == NumV2Elements) {
13659 int LowV1Elements = 0, LowV2Elements = 0;
13660 for (int M : Mask.slice(0, NumElements / 2))
13661 if (M >= NumElements)
13665 if (LowV2Elements > LowV1Elements)
13667 if (LowV2Elements == LowV1Elements) {
13668 int SumV1Indices = 0, SumV2Indices = 0;
13669 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13670 if (Mask[i] >= NumElements)
13672 else if (Mask[i] >= 0)
13674 if (SumV2Indices < SumV1Indices)
13676 if (SumV2Indices == SumV1Indices) {
13677 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13678 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13679 if (Mask[i] >= NumElements)
13680 NumV2OddIndices += i % 2;
13681 else if (Mask[i] >= 0)
13682 NumV1OddIndices += i % 2;
13683 if (NumV2OddIndices < NumV1OddIndices)
13692 /// \brief Top-level lowering for x86 vector shuffles.
13694 /// This handles decomposition, canonicalization, and lowering of all x86
13695 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13696 /// above in helper routines. The canonicalization attempts to widen shuffles
13697 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13698 /// s.t. only one of the two inputs needs to be tested, etc.
13699 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13700 SelectionDAG &DAG) {
13701 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13702 ArrayRef<int> Mask = SVOp->getMask();
13703 SDValue V1 = Op.getOperand(0);
13704 SDValue V2 = Op.getOperand(1);
13705 MVT VT = Op.getSimpleValueType();
13706 int NumElements = VT.getVectorNumElements();
13708 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13710 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13711 "Can't lower MMX shuffles");
13713 bool V1IsUndef = V1.isUndef();
13714 bool V2IsUndef = V2.isUndef();
13715 if (V1IsUndef && V2IsUndef)
13716 return DAG.getUNDEF(VT);
13718 // When we create a shuffle node we put the UNDEF node as the second operand,
13719 // but in some cases the first operand may be transformed to UNDEF.
13720 // In this case we should just commute the node.
13722 return DAG.getCommutedVectorShuffle(*SVOp);
13724 // Check for non-undef masks pointing at an undef vector and make the masks
13725 // undef as well. This makes it easier to match the shuffle based solely on
13726 // the mask.
13729 if (M >= NumElements) {
13730 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13731 for (int &M : NewMask)
13732 if (M >= NumElements)
13734 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13737 // Check for illegal shuffle mask element index values.
13738 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13739 assert(llvm::all_of(Mask,
13740 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13741 "Out of bounds shuffle index");
13743 // We actually see shuffles that are entirely re-arrangements of a set of
13744 // zero inputs. This mostly happens while decomposing complex shuffles into
13745 // simple ones. Directly lower these as a buildvector of zeros.
13746 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13747 if (Zeroable.isAllOnesValue())
13748 return getZeroVector(VT, Subtarget, DAG, DL);
13750 // Try to collapse shuffles into using a vector type with fewer elements but
13751 // wider element types. We cap this to not form integers or floating point
13752 // elements wider than 64 bits, but it might be interesting to form i128
13753 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
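// Example of the widening (illustrative): a v4i32 shuffle with mask
// {0, 1, 6, 7} pairs up cleanly and becomes a v2i64 shuffle with mask
// {0, 3}, provided v2i64 is a legal type for the subtarget.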
13754 SmallVector<int, 16> WidenedMask;
13755 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13756 canWidenShuffleElements(Mask, WidenedMask)) {
13757 MVT NewEltVT = VT.isFloatingPoint()
13758 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13759 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13760 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13761 // Make sure that the new vector type is legal. For example, v2f64 isn't
13762 // legal on SSE1.
13763 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13764 V1 = DAG.getBitcast(NewVT, V1);
13765 V2 = DAG.getBitcast(NewVT, V2);
13766 return DAG.getBitcast(
13767 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13771 // Commute the shuffle if it will improve canonicalization.
13772 if (canonicalizeShuffleMaskWithCommute(Mask))
13773 return DAG.getCommutedVectorShuffle(*SVOp);
13775 // For each vector width, delegate to a specialized lowering routine.
13776 if (VT.is128BitVector())
13777 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13780 if (VT.is256BitVector())
13781 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13784 if (VT.is512BitVector())
13785 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13789 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13791 llvm_unreachable("Unimplemented!");
13794 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13795 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13796 const X86Subtarget &Subtarget,
13797 SelectionDAG &DAG) {
13798 SDValue Cond = Op.getOperand(0);
13799 SDValue LHS = Op.getOperand(1);
13800 SDValue RHS = Op.getOperand(2);
13802 MVT VT = Op.getSimpleValueType();
13804 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13806 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13808 // Only non-legal VSELECTs reach this lowering; convert those into generic
13809 // shuffles and re-use the shuffle lowering path for blends.
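// Hypothetical example (not in the original): for a v4i32 VSELECT with
// constant condition <-1, 0, 0, -1>, elements 0 and 3 take the LHS and
// elements 1 and 2 take the RHS, giving the shuffle mask {0, 5, 6, 3}.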
13810 SmallVector<int, 32> Mask;
13811 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13812 SDValue CondElt = CondBV->getOperand(i);
13814 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13817 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
  // A vselect where all conditions and data are constants can be optimized
  // into a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
  if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
    return SDValue();

  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
  // with patterns on the mask registers on AVX-512.
  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
    return Op;

  // Try to lower this to a blend-style vector shuffle. This can handle all
  // constant condition cases.
  if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
    return BlendOp;

  // Variable blends are only legal from SSE4.1 onward.
  if (!Subtarget.hasSSE41())
    return SDValue();

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
  // into an i1 condition so that we can use the mask-based 512-bit blend
  // patterns.
  if (VT.getSizeInBits() == 512) {
    SDValue Cond = Op.getOperand(0);
    // The vNi1 condition case should be handled above as it can be trivially
    // lowered.
    assert(Cond.getValueType().getScalarSizeInBits() ==
               VT.getScalarSizeInBits() &&
           "Should have a size-matched integer condition!");
    // Build a mask by testing the condition against itself (tests for zero).
    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
    // Now return a new VSELECT using the mask.
    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
  }

  // Only some types will be legal on some subtargets. If we can emit a legal
  // VSELECT-matching blend, return Op; if we need to expand, return a null
  // SDValue.
  switch (VT.SimpleTy) {
  default:
    // Most of the vector types have blends past SSE4.1.
    return Op;

  case MVT::v32i8: {
    // The byte blends for AVX vectors were introduced only in AVX2.
    if (Subtarget.hasAVX2())
      return Op;

    return SDValue();
  }

  case MVT::v8i16:
  case MVT::v16i16: {
    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return Op;

    // FIXME: We should custom lower this by fixing the condition and using i8
    // blends.
    return SDValue();
  }
  }
}

static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
    return SDValue();

  if (VT.getSizeInBits() == 8) {
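    // PEXTRB produces its result in a 32-bit GPR; the AssertZext below records
    // that the upper 24 bits are known zero, so the final truncate to i8 is
    // free.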
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         isNullConstant(Op.getOperand(1))) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getBitcast(MVT::f32, Extract);
  }

  if (VT == MVT::i32 || VT == MVT::i64) {
    // EXTRACTPS/PEXTRQ work with a constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }

  return SDValue();
}

/// Extract one bit from a mask vector, like v16i1 or v8i1 (an AVX-512
/// feature).
SDValue
X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDValue Vec = Op.getOperand(0);
  SDLoc dl(Vec);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);
  MVT EltVT = Op.getSimpleValueType();

  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
         "Unexpected vector type in ExtractBitFromMaskVector");

  // A variable index can't be handled in mask registers, so
  // extend the vector to VR512/VR128.
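  // E.g. (a sketch): for extractelement <8 x i1> %m, i32 %i the mask is
  // sign-extended to v8i64, the element is extracted as an i64, and the
  // result is truncated back down to i1.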
  if (!isa<ConstantSDNode>(Idx)) {
    unsigned NumElts = VecVT.getVectorNumElements();
    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
    // than extending to 128/256-bit.
    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                              ExtVT.getVectorElementType(), Ext, Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
      (VecVT.getVectorNumElements() < 8)) {
    // Use kshiftlw/rw instruction.
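    // Sketch: to read bit 3 of a v16i1 mask, shift left by 15 - 3 = 12 to
    // park it in the MSB, then shift right by 15 so it lands in bit 0 with
    // the rest of the register cleared.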
    VecVT = MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
                      DAG.getUNDEF(VecVT),
                      Vec,
                      DAG.getIntPtrConstant(0, dl));
  }
  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
  if (MaxShift - IdxVal)
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                    DAG.getConstant(MaxShift, dl, MVT::i8));
  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  MVT VecVT = Vec.getSimpleValueType();
  SDValue Idx = Op.getOperand(1);

  if (VecVT.getVectorElementType() == MVT::i1)
    return ExtractBitFromMaskVector(Op, DAG);

  if (!isa<ConstantSDNode>(Idx)) {
    // It's more profitable to go through memory (1 cycle throughput)
    // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
    // The IACA tool was used to get the performance estimates
    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
    //
    // example: extractelement <16 x i8> %a, i32 %i
    //
    // The in-register sequence:
    //
    // Block Throughput: 3.00 Cycles
    // Throughput Bottleneck: Port5
    //
    // | Num Of |   Ports pressure in cycles  |    |
    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
    // ---------------------------------------------
    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
    // Total Num Of Uops: 4
    //
    // The stack-spill sequence for the same extract:
    //
    // Block Throughput: 1.00 Cycles
    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
    //
    // |    |  Ports pressure in cycles   |  |
    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
    // ---------------------------------------------------------
    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
    // Total Num Of Uops: 4
    return SDValue();
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
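  // E.g. (a sketch): element 5 of a v8f32 lives in the upper 128-bit half, so
  // extract the subvector starting at element 4 and then take element
  // 5 & 3 == 1 of it.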
  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
    // Get the 128-bit vector.
    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
    MVT EltVT = VecVT.getVectorElementType();

    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
    // this can be done with a mask.
    IdxVal &= ElemsPerChunk - 1;
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                       DAG.getConstant(IdxVal, dl, MVT::i32));
  }

  assert(VecVT.is128BitVector() && "Unexpected vector length");

  MVT VT = Op.getSimpleValueType();

  if (VT.getSizeInBits() == 16) {
    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
    // we're going to zero extend the register or fold the store (SSE41 only).
    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

    // Transform it so it matches pextrw, which produces a 32-bit result.
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  }

  if (Subtarget.hasSSE41())
    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
      return Res;

  // TODO: We only extract a single element from v16i8, we can probably afford
  // to be more aggressive here before using the default approach of spilling to
  // stack.
  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
    // Extract either the lowest i32 or any i16, and extract the sub-byte.
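    // Worked example (a sketch): for index 5 of a v16i8, the byte lives in
    // 16-bit word 5 / 2 = 2; extract that word and shift right by
    // (5 % 2) * 8 = 8 bits to reach the high byte.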
    int DWordIdx = IdxVal / 4;
    if (DWordIdx == 0) {
      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                DAG.getBitcast(MVT::v4i32, Vec),
                                DAG.getIntPtrConstant(DWordIdx, dl));
      int ShiftVal = (IdxVal % 4) * 8;
      if (ShiftVal != 0)
        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
                          DAG.getConstant(ShiftVal, dl, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
    }

    int WordIdx = IdxVal / 2;
    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                              DAG.getBitcast(MVT::v8i16, Vec),
                              DAG.getIntPtrConstant(WordIdx, dl));
    int ShiftVal = (IdxVal % 2) * 8;
    if (ShiftVal != 0)
      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
                        DAG.getConstant(ShiftVal, dl, MVT::i16));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
  }

  if (VT.getSizeInBits() == 32) {
    if (IdxVal == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    if (IdxVal == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0, dl));
  }

  return SDValue();
}

/// Insert one bit into a mask vector, like v16i1 or v8i1 (an AVX-512
/// feature).
SDValue
X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  SDValue Elt = Op.getOperand(1);
  SDValue Idx = Op.getOperand(2);
  MVT VecVT = Vec.getSimpleValueType();

  if (!isa<ConstantSDNode>(Idx)) {
    // Non-constant index: extend source and destination,
    // insert the element, and then truncate the result.
    MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
    MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
  }

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
  unsigned NumElems = VecVT.getVectorNumElements();

  if (Vec.isUndef()) {
    if (IdxVal)
      EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                             DAG.getConstant(IdxVal, dl, MVT::i8));
    return EltInVec;
  }

  // Insertion of one bit into the first position.
  if (IdxVal == 0) {
    // Clear the top bits of the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
                           DAG.getConstant(NumElems - 1, dl, MVT::i8));
    // Clear the first bit of the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }
  // Insertion of one bit into the last position.
  if (IdxVal == NumElems - 1) {
    // Move the bit to the last position inside the vector.
    EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
                           DAG.getConstant(IdxVal, dl, MVT::i8));
    // Clear the last bit of the source vector.
    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));
    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
                      DAG.getConstant(1, dl, MVT::i8));

    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
  }

  // Use a shuffle to insert the element.
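  // E.g. (a sketch): inserting at index 2 of a v8i1 builds the mask
  // <0, 1, 8, 3, 4, 5, 6, 7>, where index 8 is element 0 of EltInVec.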
  SmallVector<int, 64> MaskVec(NumElems);
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = (i == IdxVal) ? NumElems : i;

  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
}

SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();

  if (EltVT == MVT::i1)
    return InsertBitToMaskVector(Op, DAG);

  SDLoc dl(Op);
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);
  if (!isa<ConstantSDNode>(N2))
    return SDValue();
  auto *N2C = cast<ConstantSDNode>(N2);
  unsigned IdxVal = N2C->getZExtValue();

  bool IsZeroElt = X86::isZeroNode(N1);
  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

  // If we are inserting an element, see if we can do this more efficiently
  // with a blend shuffle against a rematerializable vector than with a costly
  // integer insertion.
  // TODO: pre-SSE41 targets will tend to use bit masking - this could still
  // be beneficial if we are inserting several zeros and can combine the masks.
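  // E.g. (a sketch): inserting 0 into lane 1 of a v4i32 %v becomes
  // shufflevector %v, zeroinitializer, <0, 5, 2, 3>, which matches a blend.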
  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
    SmallVector<int, 8> BlendMask;
    for (unsigned i = 0; i != NumElts; ++i)
      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
                                  : DAG.getConstant(-1, dl, VT);
    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
  }

  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
  // into that, and then insert the subvector back into the result.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    // With a 256-bit vector, we can insert into the zero element efficiently
    // using a blend if we have AVX or AVX2 and the right data type.
    if (VT.is256BitVector() && IdxVal == 0) {
      // TODO: It is worthwhile to cast integer to floating point and back
      // and incur a domain crossing penalty if that's what we'll end up
      // doing anyway after extracting to a 128-bit vector.
      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
        N2 = DAG.getIntPtrConstant(1, dl);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
      }
    }

    // Get the desired 128-bit vector chunk.
    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

    // Insert the element into the desired chunk.
    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
    assert(isPowerOf2_32(NumEltsIn128));
    // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
                    DAG.getConstant(IdxIn128, dl, MVT::i32));

    // Insert the changed part back into the bigger vector.
    return insert128BitVector(N0, V, IdxVal, DAG, dl);
  }
  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 is required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(IdxVal, dl);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.
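      // E.g. (a sketch): inserting into lane 2 encodes the immediate as
      // IdxVal << 4 = 0x20 - source select 0, destination select 2, and an
      // empty zero mask.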

      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand.
        N2 = DAG.getIntPtrConstant(1, dl);
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
      }
      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
    }

    // PINSR* works with a constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }
  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // Replacing a xor+movd with xorps is always cheaper and simplifies further
  // combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && "Expected an SSE type!");

  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
  if (OpVT == MVT::v4i32)
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

  SDLoc dl(Op);
  SDValue In = Op.getOperand(0);
  SDValue Idx = Op.getOperand(1);
  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT ResVT = Op.getSimpleValueType();

  assert((In.getSimpleValueType().is256BitVector() ||
          In.getSimpleValueType().is512BitVector()) &&
         "Can only extract from 256-bit or 512-bit vectors");

  // If the input is a buildvector just emit a smaller one.
  unsigned ElemsPerChunk = ResVT.getVectorNumElements();
  if (In.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));

  // Everything else is legal.
  return Op;
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  CodeModel::Model M = getTargetMachine().getCodeModel();
  if (Subtarget.isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

  SDLoc DL(Op);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isPositionIndependent() && !Subtarget.is64Bit()) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  // For symbols that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlag))
    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
                                              const SDLoc &dl, int64_t Offset,
                                              SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
  }

  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
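// Roughly (a sketch of typical codegen, for illustration only):
//   leal x@tlsgd(,%ebx,1), %eax
//   calll ___tls_get_addr@PLT
// with the variable's address returned in %eax.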
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG,
                                           const EVT PtrVT,
                                           bool is64Bit) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (is64Bit) {
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InFlag;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
    InFlag = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or   "addl x@indntpoff,%eax" (initial exec)
  // or   "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().Options.EmulatedTLS)
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
      if (Subtarget.is64Bit())
        return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
                                         Subtarget.is64Bit());
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine ISD will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
                               DAG.getIntPtrConstant(0, DL, true),
                               Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium() ||
      Subtarget.isTargetWindowsGNU()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                                  ; from TEB
    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
    //   mov     rcx, qword [rdx+rcx*8]
    //   mov     eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                        ? Type::getInt8PtrTy(*DAG.getContext(),
                                                             256)
                                        : Type::getInt32PtrTy(*DAG.getContext(),
                                                              257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      auto &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of the start of the .tls section.
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getSimpleValueType();
  unsigned VTBits = VT.getSizeInBits();
  SDLoc dl(Op);
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
  // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
  // during isel.
  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, dl, MVT::i8))
                       : DAG.getConstant(0, dl, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
  }

  // If the shift amount is larger or equal than the width of a part we can't
  // rely on the results of shld/shrd. Insert a test and select the appropriate
  // values for large shift amounts.
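  // E.g. (a sketch): for a 64-bit SHL_PARTS by 40 on a 32-bit target, the test
  // below sees (40 & 32) != 0, so Hi = Lo << (40 & 31) = Lo << 8 and the low
  // part becomes 0.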
  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, dl, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, dl, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
      return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
                                     DAG.getUNDEF(SrcVT)));
    }
    if (SrcVT.getVectorElementType() == MVT::i1) {
      if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
        return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                           DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
      MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
      return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
    }
    return SDValue();
  }

  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget.is64Bit()) {
    return Op;
  }

  SDValue ValueToStore = Op.getOperand(0);
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);

  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  SDValue Chain = DAG.getStore(
      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD.
  SDLoc DL(Op);
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                                    X86ISD::FILD, DL,
                                           Tys, Ops, SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is glued to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When the stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueSizeInBits()/8;
    int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
    auto PtrVT = getPointerTy(MF.getDataLayout());
    SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
        MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, Op.getValueType(), MMO);
    Result = DAG.getLoad(
        Op.getValueType(), DL, Chain, StackSlot,
        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
  }

  return Result;
}

/// 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here is what we're trying to output:
  /*
     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
     #ifdef __SSE3__
       haddpd   %xmm0, %xmm0
     #else
       pshufd   $0x4e, %xmm0, %xmm1
       addpd    %xmm1, %xmm0
     #endif
  */
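
  // Why this works (a sketch): punpckldq builds two doubles whose bit patterns
  // are (0x43300000 << 32) | lo = 0x1.0p52 + lo and
  // (0x45300000 << 32) | hi = 0x1.0p84 + hi * 0x1.0p32. Subtracting the magic
  // constants leaves exactly lo and hi * 2^32, whose sum is the input value.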

  SDLoc dl(Op);
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
  Constant *C0 = ConstantDataVector::get(*Context, CV0);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

  SmallVector<Constant*,2> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4330000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
                                      APInt(64, 0x4530000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

  // Load the 64-bit value into an XMM register.
  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                            Op.getOperand(0));
  SDValue CLod0 =
      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue Unpck1 =
      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
  SDValue CLod1 =
      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
                  /* Alignment = */ 16);
  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  SDValue Result;
  if (Subtarget.hasSSE3()) {
    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
  } else {
    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
  }

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                     DAG.getIntPtrConstant(0, dl));
}

/// 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc dl(Op);
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             Op.getOperand(0));

  // Zero out the upper parts of the register.
  Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getBitcast(MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0, dl));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(
      ISD::OR, dl, MVT::v2i64,
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
      DAG.getBitcast(MVT::v2i64,
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
  Or =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                  DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));

  // Subtract the bias.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getSimpleValueType();

  if (DestVT.bitsLT(MVT::f64))
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0, dl));
  if (DestVT.bitsGT(MVT::f64))
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);

  // Sub is already the right width; no rounding needed.
  return Sub;
}

static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget, SDLoc &DL) {
  if (Op.getSimpleValueType() != MVT::v2f64)
    return SDValue();

  SDValue N0 = Op.getOperand(0);
  assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

  // Legalize to v4i32 type.
  N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
                   DAG.getUNDEF(MVT::v2i32));

  if (Subtarget.hasAVX512())
    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);

  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
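  // In other words (a sketch): each 32-bit lane v is split as
  // v = (v >> 16) * 2^16 + (v & 0xffff); both halves fit in a signed i32, so
  // two signed conversions, a multiply by 65536.0, and an add reassemble the
  // unsigned value exactly.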
  SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
  SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

  // Two to the power of half-word-size.
  SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

  // Clear the upper part of LO, and the lower part of HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}

static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                                 (uint4) 0x53000000, 0xaa);
  // #else
  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //     return (float4) lo + fhi;
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
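  //
  // The magic constants (a sketch of why they work): 0x4b000000 is 2^23 as a
  // float, so lo reinterprets as 2^23 + (v & 0xffff); 0x53000000 is 2^39, so
  // hi reinterprets as 2^39 + (v >> 16) * 2^16. Subtracting (2^39 + 2^23)
  // from hi and then adding lo leaves exactly (float) v.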
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                             (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    // uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }

  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

  // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  // return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}

SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}
15305 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15306 SelectionDAG &DAG) const {
15307 SDValue N0 = Op.getOperand(0);
15309 auto PtrVT = getPointerTy(DAG.getDataLayout());
15311 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15312 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15313 // the optimization here.
15314 if (DAG.SignBitIsZero(N0))
15315 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15317 if (Op.getSimpleValueType().isVector())
15318 return lowerUINT_TO_FP_vec(Op, DAG);
15320 MVT SrcVT = N0.getSimpleValueType();
15321 MVT DstVT = Op.getSimpleValueType();
15323 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15324 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15325 // Conversions from unsigned i32 to f32/f64 are legal,
15326 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15330 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15331 return LowerUINT_TO_FP_i64(Op, DAG);
15332 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15333 return LowerUINT_TO_FP_i32(Op, DAG);
15334 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15337 // Make a 64-bit buffer, and use it to build an FILD.
15338 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15339 if (SrcVT == MVT::i32) {
15340 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15341 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15342 StackSlot, MachinePointerInfo());
15343 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15344 OffsetSlot, MachinePointerInfo());
15345 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15349 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15350 SDValue ValueToStore = Op.getOperand(0);
15351 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15352 // Bitcasting to f64 here allows us to do a single 64-bit store from
15353 // an SSE register, avoiding the store forwarding penalty that would come
15354 // with two 32-bit stores.
15355 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15356 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15357 MachinePointerInfo());
15358 // For i64 source, we need to add the appropriate power of 2 if the input
15359 // was negative. This is the same as the optimization in
15360 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15361 // we must be careful to do the computation in x87 extended precision, not
15362 // in SSE. (The generic code can't know it's OK to do this, or how to.)
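// Worked example (illustrative): 0x5F800000 is the IEEE-754 single 2^64
// (sign 0, biased exponent 0xBF = 191, so 2^(191-127); mantissa 0). For the
// input 0xFFFFFFFFFFFFFFFF, FILD reads the i64 as -1.0; the sign-set path
// then adds 2^64, yielding 2^64 - 1 (~1.8446744e19), the correct unsigned
// value.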
15363 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15364 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15365 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15366 MachineMemOperand::MOLoad, 8, 8);
15368 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15369 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15370 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15371                                        MVT::i64, MMO);
15373 APInt FF(32, 0x5F800000ULL);
15375 // Check whether the sign bit is set.
15376 SDValue SignSet = DAG.getSetCC(
15377 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15378 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15380 // Build a 64-bit pair (0, FF) in the constant pool, with FF in the low bits.
15381 SDValue FudgePtr = DAG.getConstantPool(
15382 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15384 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15385 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15386 SDValue Four = DAG.getIntPtrConstant(4, dl);
15387 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15388 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15390 // Load the value out, extending it from f32 to f80.
15391 // FIXME: Avoid the extend by constructing the right constant pool?
15392 SDValue Fudge = DAG.getExtLoad(
15393 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15394 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15395 /* Alignment = */ 4);
15396 // Extend everything to 80 bits to force it to be done on x87.
15397 // TODO: Are there any fast-math-flags to propagate here?
15398 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15399 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15400 DAG.getIntPtrConstant(0, dl));
15403 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15404 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15405 // just return an <SDValue(), SDValue()> pair.
15406 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15407 // to i16, i32 or i64, and we lower it to a legal sequence.
15408 // If lowered to the final integer result we return a <result, SDValue()> pair.
15409 // Otherwise we lower it to a sequence ending with a FIST, return a
15410 // <FIST, StackSlot> pair, and the caller is responsible for loading
15411 // the final integer result from StackSlot.
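// For example (illustrative): f64 -> i32 with SSE2 is already Legal
// (CVTTSD2SI), so <SDValue(), SDValue()> is returned; f80 -> i64 has no
// direct instruction, so the FIST path returns a <FIST, StackSlot> pair for
// the caller to load from.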
15412 std::pair<SDValue,SDValue>
15413 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15414 bool IsSigned, bool IsReplace) const {
15416 SDLoc DL(Op);
15417 EVT DstTy = Op.getValueType();
15418 EVT TheVT = Op.getOperand(0).getValueType();
15419 auto PtrVT = getPointerTy(DAG.getDataLayout());
15421 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15422 // f16 must be promoted before using the lowering in this routine.
15423 // fp128 does not use this lowering.
15424 return std::make_pair(SDValue(), SDValue());
15427 // If using FIST to compute an unsigned i64, we'll need some fixup
15428 // to handle values above the maximum signed i64. A FIST is always
15429 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15430 bool UnsignedFixup = !IsSigned &&
15431 DstTy == MVT::i64 &&
15432 (!Subtarget.is64Bit() ||
15433 !isScalarFPTypeInSSEReg(TheVT));
15435 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15436 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15437 // The low 32 bits of the fist result will have the correct uint32 result.
15438 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15439 DstTy = MVT::i64;
15440 }
15442 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15443 DstTy.getSimpleVT() >= MVT::i16 &&
15444 "Unknown FP_TO_INT to lower!");
15446 // These are really Legal.
15447 if (DstTy == MVT::i32 &&
15448 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15449 return std::make_pair(SDValue(), SDValue());
15450 if (Subtarget.is64Bit() &&
15451 DstTy == MVT::i64 &&
15452 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15453 return std::make_pair(SDValue(), SDValue());
15455 // We lower FP->int64 into FISTP64 followed by a load from a temporary
15456 // stack slot.
15457 MachineFunction &MF = DAG.getMachineFunction();
15458 unsigned MemSize = DstTy.getSizeInBits()/8;
15459 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15460 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15462 unsigned Opc;
15463 switch (DstTy.getSimpleVT().SimpleTy) {
15464 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15465 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15466 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15467 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15470 SDValue Chain = DAG.getEntryNode();
15471 SDValue Value = Op.getOperand(0);
15472 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15474 if (UnsignedFixup) {
15476 // Conversion to unsigned i64 is implemented with a select,
15477 // depending on whether the source value fits in the range
15478 // of a signed i64. Let Thresh be the FP equivalent of
15479 // 0x8000000000000000ULL.
15481 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15482 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15483 // Fist-to-mem64 FistSrc
15484 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15485 // to XOR'ing the high 32 bits with Adjust.
15487 // Being a power of 2, Thresh is exactly representable in all FP formats.
15488 // For X87 we'd like to use the smallest FP type for this constant, but
15489 // for DAG type consistency we have to match the FP operand type.
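// Worked example (illustrative): Thresh = 2^63 (0x5F000000 as a float,
// biased exponent 0xBE = 190). For the input 2^63 + 42, Value >= Thresh, so
// FistSrc = Value - Thresh = 42 and Adjust = 0x80000000; the FIST stores 42,
// and XOR'ing the high dword with Adjust restores bit 63, giving back
// 2^63 + 42 exactly.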
15491 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15492 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15493 bool LosesInfo = false;
15494 if (TheVT == MVT::f64)
15495 // The rounding mode is irrelevant as the conversion should be exact.
15496 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15497                         &LosesInfo);
15498 else if (TheVT == MVT::f80)
15499 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15500 APFloat::rmNearestTiesToEven, &LosesInfo);
15502 assert(Status == APFloat::opOK && !LosesInfo &&
15503 "FP conversion should have been exact");
15505 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15507 SDValue Cmp = DAG.getSetCC(DL,
15508 getSetCCResultType(DAG.getDataLayout(),
15509 *DAG.getContext(), TheVT),
15510 Value, ThreshVal, ISD::SETLT);
15511 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15512 DAG.getConstant(0, DL, MVT::i32),
15513 DAG.getConstant(0x80000000, DL, MVT::i32));
15514 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15515 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15516 *DAG.getContext(), TheVT),
15517 Value, ThreshVal, ISD::SETLT);
15518 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15521 // FIXME: This causes a redundant load/store if the SSE-class value is already
15522 // in memory, such as if it is on the call stack.
15523 if (isScalarFPTypeInSSEReg(TheVT)) {
15524 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15525 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15526 MachinePointerInfo::getFixedStack(MF, SSFI));
15527 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15528 SDValue Ops[] = {
15529   Chain, StackSlot, DAG.getValueType(TheVT)
15530 };
15532 MachineMemOperand *MMO =
15533 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15534 MachineMemOperand::MOLoad, MemSize, MemSize);
15535 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15536 Chain = Value.getValue(1);
15537 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15538 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15541 MachineMemOperand *MMO =
15542 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15543 MachineMemOperand::MOStore, MemSize, MemSize);
15545 if (UnsignedFixup) {
15547 // Insert the FIST, load its result as two i32's,
15548 // and XOR the high i32 with Adjust.
15550 SDValue FistOps[] = { Chain, Value, StackSlot };
15551 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15552 FistOps, DstTy, MMO);
15554 SDValue Low32 =
15555     DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15556 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15558 SDValue High32 =
15559     DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15560 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15562 if (Subtarget.is64Bit()) {
15563 // Join High32 and Low32 into a 64-bit result.
15564 // (High32 << 32) | Low32
15565 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15566 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15567 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15568 DAG.getConstant(32, DL, MVT::i8));
15569 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15570 return std::make_pair(Result, SDValue());
15573 SDValue ResultOps[] = { Low32, High32 };
15575 SDValue pair = IsReplace
15576 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15577 : DAG.getMergeValues(ResultOps, DL);
15578 return std::make_pair(pair, SDValue());
15580 // Build the FP_TO_INT*_IN_MEM
15581 SDValue Ops[] = { Chain, Value, StackSlot };
15582 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15583                                        Ops, DstTy, MMO);
15584 return std::make_pair(FIST, StackSlot);
15588 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15589 const X86Subtarget &Subtarget) {
15590 MVT VT = Op->getSimpleValueType(0);
15591 SDValue In = Op->getOperand(0);
15592 MVT InVT = In.getSimpleValueType();
15594 SDLoc dl(Op);
15595 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15596 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15598 // Optimize vectors in AVX mode:
15599 //
15600 //   v8i16 -> v8i32
15601 //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15602 //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15603 //   Concat upper and lower parts.
15604 //
15605 //   v4i32 -> v4i64
15606 //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
15607 //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
15608 //   Concat upper and lower parts.
15609 //
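// Worked example (illustrative, little-endian lane order): for a v4i32
// zero-extend, vpunpckldq with a zero vector interleaves to (x0, 0, x1, 0);
// reinterpreted as v2i64 this is exactly (zext x0, zext x1), and the high
// half follows from vpunpckhdq.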
15611 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15612     ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15613     ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15614   return SDValue();
15616 if (Subtarget.hasInt256())
15617 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15619 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15620 SDValue Undef = DAG.getUNDEF(InVT);
15621 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15622 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15623 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15625 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15626 VT.getVectorNumElements()/2);
15628 OpLo = DAG.getBitcast(HVT, OpLo);
15629 OpHi = DAG.getBitcast(HVT, OpHi);
15631 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15634 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15635 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15636 MVT VT = Op->getSimpleValueType(0);
15637 SDValue In = Op->getOperand(0);
15638 MVT InVT = In.getSimpleValueType();
15640 unsigned NumElts = VT.getVectorNumElements();
15642 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15643 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15644 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15646 if (InVT.getVectorElementType() != MVT::i1)
15647   return SDValue();
15649 // Extend VT if the target is a 256- or 128-bit vector and VLX is not supported.
15650 MVT ExtVT = VT;
15651 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15652   ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15654 SDValue One =
15655  DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15656 SDValue Zero =
15657  DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15659 SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
15660 if (VT == ExtVT)
15661   return SelectedVal;
15662 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15665 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15666 SelectionDAG &DAG) {
15667 if (Subtarget.hasFp256())
15668 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15669   return Res;
15671 return SDValue();
15672 }
15674 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15675 SelectionDAG &DAG) {
15677 MVT VT = Op.getSimpleValueType();
15678 SDValue In = Op.getOperand(0);
15679 MVT SVT = In.getSimpleValueType();
15681 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15682 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15684 if (Subtarget.hasFp256())
15685 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15686   return Res;
15688 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15689        VT.getVectorNumElements() != SVT.getVectorNumElements());
15690 return SDValue();
15691 }
15693 /// Helper to recursively truncate vector elements in half with PACKSS.
15694 /// It makes use of the fact that vector comparison results will be all-zeros
15695 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15696 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15697 /// within each 128-bit lane.
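/// Worked example (illustrative): a v8i16 comparison result holds only
/// 0x0000 or 0xFFFF per element; PACKSSWB saturates those to 0x00 or 0xFF,
/// so packing two such v8i16 halves yields the equivalent v16i8 mask with
/// no information loss.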
15698 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15699                                                const SDLoc &DL,
15700                                                SelectionDAG &DAG,
15701                                                const X86Subtarget &Subtarget) {
15702 // Requires SSE2 but AVX512 has fast truncate.
15703 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15704   return SDValue();
15706 EVT SrcVT = In.getValueType();
15708 // No truncation required, we might get here due to recursive calls.
15709 if (SrcVT == DstVT)
15710   return In;
15712 // We only support vector truncation to 128 bits or wider from a
15713 // source that is 256 bits or wider.
15714 if ((DstVT.getSizeInBits() % 128) != 0)
15715   return SDValue();
15716 if ((SrcVT.getSizeInBits() % 256) != 0)
15717   return SDValue();
15719 unsigned NumElems = SrcVT.getVectorNumElements();
15720 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15721 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15723 EVT PackedSVT =
15724     EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15726 // Extract lower/upper subvectors.
15727 unsigned NumSubElts = NumElems / 2;
15728 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15729 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15730 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15732 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15733 if (SrcVT.is256BitVector()) {
15734 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15735 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15736 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15737 return DAG.getBitcast(DstVT, Res);
15740 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15741 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15742 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15743 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15744 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15745 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15747 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15748 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15749 Res = DAG.getBitcast(MVT::v4i64, Res);
15750 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15752 if (DstVT.is256BitVector())
15753 return DAG.getBitcast(DstVT, Res);
15755 // If this is a 512-bit -> 128-bit truncate, pack again for another stage.
15756 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15757 Res = DAG.getBitcast(PackedVT, Res);
15758 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15761 // Recursively pack lower/upper subvectors, concat result and pack again.
15762 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15763 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15764 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15765 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15767 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15768 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15769 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15772 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15773 const X86Subtarget &Subtarget) {
15774 SDLoc DL(Op);
15776 MVT VT = Op.getSimpleValueType();
15777 SDValue In = Op.getOperand(0);
15778 MVT InVT = In.getSimpleValueType();
15780 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15782 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15783 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15784 if (InVT.getScalarSizeInBits() <= 16) {
15785 if (Subtarget.hasBWI()) {
15786 // Legal; this will select to VPMOVB2M or VPMOVW2M.
15787 // Shifting packed bytes is not supported natively, so bitcast to words.
15788 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15789 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15790 DAG.getBitcast(ExtVT, In),
15791 DAG.getConstant(ShiftInx, DL, ExtVT));
15792 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15793 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15794 }
15795 // Use TESTD/Q, extended vector to packed dword/qword.
15796 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15797 "Unexpected vector type.");
15798 unsigned NumElts = InVT.getVectorNumElements();
15799 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15800 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15801 InVT = ExtVT;
15802 ShiftInx = InVT.getScalarSizeInBits() - 1;
15803 }
15805 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15806 DAG.getConstant(ShiftInx, DL, InVT));
15807 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15810 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15811 SDLoc DL(Op);
15812 MVT VT = Op.getSimpleValueType();
15813 SDValue In = Op.getOperand(0);
15814 MVT InVT = In.getSimpleValueType();
15816 if (VT == MVT::i1) {
15817 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15818 "Invalid scalar TRUNCATE operation");
15819 if (InVT.getSizeInBits() >= 32)
15820   return SDValue();
15821 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15822 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15823 }
15824 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15825 "Invalid TRUNCATE operation");
15827 if (VT.getVectorElementType() == MVT::i1)
15828 return LowerTruncateVecI1(Op, DAG, Subtarget);
15830 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15831 if (Subtarget.hasAVX512()) {
15832 // word to byte only under BWI
15833 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15834 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15835 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
15836 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15839 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
15840 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
15841 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15842   return V;
15844 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15845 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
15846 if (Subtarget.hasInt256()) {
15847 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15848 In = DAG.getBitcast(MVT::v8i32, In);
15849 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
15850 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15851 DAG.getIntPtrConstant(0, DL));
15854 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15855 DAG.getIntPtrConstant(0, DL));
15856 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15857 DAG.getIntPtrConstant(2, DL));
15858 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15859 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15860 static const int ShufMask[] = {0, 2, 4, 6};
15861 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15864 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15865 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15866 if (Subtarget.hasInt256()) {
15867 In = DAG.getBitcast(MVT::v32i8, In);
15869 // The PSHUFB mask:
15870 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
15871 -1, -1, -1, -1, -1, -1, -1, -1,
15872 16, 17, 20, 21, 24, 25, 28, 29,
15873 -1, -1, -1, -1, -1, -1, -1, -1 };
15874 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
15875 In = DAG.getBitcast(MVT::v4i64, In);
15877 static const int ShufMask2[] = {0, 2, -1, -1};
15878 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
15879 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15880 DAG.getIntPtrConstant(0, DL));
15881 return DAG.getBitcast(VT, In);
15884 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15885 DAG.getIntPtrConstant(0, DL));
15887 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15888 DAG.getIntPtrConstant(4, DL));
15890 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15891 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15893 // The PSHUFB mask:
15894 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15895 -1, -1, -1, -1, -1, -1, -1, -1};
15897 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
15898 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
15900 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15901 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15903 // The MOVLHPS Mask:
15904 static const int ShufMask2[] = {0, 1, 4, 5};
15905 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15906 return DAG.getBitcast(MVT::v8i16, res);
15909 // Handle truncation of V256 to V128 using shuffles.
15910 if (!VT.is128BitVector() || !InVT.is256BitVector())
15911   return SDValue();
15913 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15915 unsigned NumElems = VT.getVectorNumElements();
15916 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15918 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15919 // Prepare truncation shuffle mask
15920 for (unsigned i = 0; i != NumElems; ++i)
15921 MaskVec[i] = i * 2;
15922 In = DAG.getBitcast(NVT, In);
15923 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
15924 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15925 DAG.getIntPtrConstant(0, DL));
15928 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
15929 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15930 MVT VT = Op.getSimpleValueType();
15932 if (VT.isVector()) {
15933 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15934 SDValue Src = Op.getOperand(0);
15935 SDLoc dl(Op);
15936 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15937 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
15938 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15939 DAG.getUNDEF(MVT::v2f32)));
15940 }
15942 return SDValue();
15943 }
15945 assert(!VT.isVector());
15947 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15948 IsSigned, /*IsReplace=*/ false);
15949 SDValue FIST = Vals.first, StackSlot = Vals.second;
15950 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15951 if (!FIST.getNode())
15952   return Op;
15954 if (StackSlot.getNode())
15955 // Load the result.
15956 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15958 // The node is the result.
15959 return FIST;
15960 }
15962 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15963 SDLoc DL(Op);
15964 MVT VT = Op.getSimpleValueType();
15965 SDValue In = Op.getOperand(0);
15966 MVT SVT = In.getSimpleValueType();
15968 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15970 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15971 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15972 In, DAG.getUNDEF(SVT)));
15973 }
15975 /// The only differences between FABS and FNEG are the mask and the logic op.
15976 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
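/// For example (illustrative, f32 case): FABS lowers to FAND with the
/// constant 0x7FFFFFFF, FNEG to FXOR with 0x80000000, and the fused
/// FNEG(FABS(x)) (FNABS) to FOR with 0x80000000.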
15977 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15978 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15979 "Wrong opcode for lowering FABS or FNEG.");
15981 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15983 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15984 // into an FNABS. We'll lower the FABS after that if it is still in use.
15985 if (IsFABS)
15986   for (SDNode *User : Op->uses())
15987     if (User->getOpcode() == ISD::FNEG)
15988       return Op;
15991 MVT VT = Op.getSimpleValueType();
15992 SDLoc dl(Op);
15993 bool IsF128 = (VT == MVT::f128);
15995 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15996 // decide if we should generate a 16-byte constant mask when we only need 4 or
15997 // 8 bytes for the scalar case.
15999 MVT LogicVT;
16000 MVT EltVT;
16002 if (VT.isVector()) {
16003   LogicVT = VT;
16004   EltVT = VT.getVectorElementType();
16005 } else if (IsF128) {
16006   // SSE instructions are used for optimized f128 logical operations.
16007   LogicVT = MVT::f128;
16008   EltVT = MVT::f32;
16009 } else {
16010   // There are no scalar bitwise logical SSE/AVX instructions, so we
16011   // generate a 16-byte vector constant and logic op even for the scalar case.
16012   // Using a 16-byte mask allows folding the load of the mask with
16013   // the logic op, so it can save ~4 bytes of code size.
16014   LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16015   EltVT = VT;
16016 }
16018 unsigned EltBits = EltVT.getSizeInBits();
16019 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16020 APInt MaskElt =
16021     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16022 const fltSemantics &Sem =
16023 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16024 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16025 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16027 SDValue Op0 = Op.getOperand(0);
16028 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16029 unsigned LogicOp =
16030     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16031 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16033 if (VT.isVector() || IsF128)
16034 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16036 // For the scalar case extend to a 128-bit vector, perform the logic op,
16037 // and extract the scalar result back out.
16038 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16039 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16040 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16041 DAG.getIntPtrConstant(0, dl));
16044 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16045 SDValue Mag = Op.getOperand(0);
16046 SDValue Sign = Op.getOperand(1);
16049 // If the sign operand is smaller, extend it first.
16050 MVT VT = Op.getSimpleValueType();
16051 if (Sign.getSimpleValueType().bitsLT(VT))
16052 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16054 // And if it is bigger, shrink it first.
16055 if (Sign.getSimpleValueType().bitsGT(VT))
16056 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16058 // At this point the operands and the result should have the same
16059 // type, and that won't be f80 since that is not custom lowered.
16060 bool IsF128 = (VT == MVT::f128);
16061 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16062 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16063 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16064 "Unexpected type in LowerFCOPYSIGN");
16066 MVT EltVT = VT.getScalarType();
16067 const fltSemantics &Sem =
16068 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16069 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16071 // Perform all scalar logic operations as 16-byte vectors because there are no
16072 // scalar FP logic instructions in SSE.
16073 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16074 // unnecessary splats, but we might miss load folding opportunities. Should
16075 // this decision be based on OptimizeForSize?
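// The lowering below computes, in bit terms (illustrative):
//   result = (Mag & ~SignMask) | (Sign & SignMask)
// e.g. for f32, copysign(1.0f, -2.0f): (0x3F800000 & 0x7FFFFFFF) |
// (0xC0000000 & 0x80000000) = 0xBF800000 = -1.0f.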
16076 bool IsFakeVector = !VT.isVector() && !IsF128;
16077 MVT LogicVT = VT;
16078 if (IsFakeVector)
16079   LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16081 // The mask constants are automatically splatted for vector types.
16082 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16083 SDValue SignMask = DAG.getConstantFP(
16084 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16085 SDValue MagMask = DAG.getConstantFP(
16086 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16088 // First, clear all bits but the sign bit from the second operand (sign).
16089 if (IsFakeVector)
16090   Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16091 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16093 // Next, clear the sign bit from the first operand (magnitude).
16094 // TODO: If we had general constant folding for FP logic ops, this check
16095 // wouldn't be necessary.
16096 SDValue MagBits;
16097 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16098   APFloat APF = Op0CN->getValueAPF();
16099   APF.clearSign();
16100   MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16101 } else {
16102   // If the magnitude operand wasn't a constant, we need to AND out the sign.
16103   if (IsFakeVector)
16104     Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16105   MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16106 }
16108 // OR the magnitude value with the sign bit.
16109 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16110 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16111 DAG.getIntPtrConstant(0, dl));
16114 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16115 SDValue N0 = Op.getOperand(0);
16116 SDLoc dl(Op);
16117 MVT VT = Op.getSimpleValueType();
16119 MVT OpVT = N0.getSimpleValueType();
16120 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16121 "Unexpected type for FGETSIGN");
16123 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16124 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16125 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16126 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16127 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16128 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16129 return Res;
16130 }
16132 // Check whether an OR'd tree is PTEST-able.
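// PTEST X, X sets ZF iff X is all zeros, so (sketch, illustrative):
//   (or (extract v, 0), (extract v, 1), ...) == 0
// can instead test the source vector(s) directly as (ptest v, v) and use
// the resulting ZF.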
16133 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16134 SelectionDAG &DAG) {
16135 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16137 if (!Subtarget.hasSSE41())
16138   return SDValue();
16140 if (!Op->hasOneUse())
16141   return SDValue();
16143 SDNode *N = Op.getNode();
16144 SDLoc DL(N);
16146 SmallVector<SDValue, 8> Opnds;
16147 DenseMap<SDValue, unsigned> VecInMap;
16148 SmallVector<SDValue, 8> VecIns;
16149 EVT VT = MVT::Other;
16151 // Recognize a special case where a vector is casted into wide integer to
16152 // test all 0s.
16153 Opnds.push_back(N->getOperand(0));
16154 Opnds.push_back(N->getOperand(1));
16156 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16157 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16158 // BFS traverse all OR'd operands.
16159 if (I->getOpcode() == ISD::OR) {
16160 Opnds.push_back(I->getOperand(0));
16161 Opnds.push_back(I->getOperand(1));
16162 // Re-evaluate the number of nodes to be traversed.
16163 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16164 continue;
16165 }
16167 // Quit if not an EXTRACT_VECTOR_ELT.
16168 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16169   return SDValue();
16171 // Quit if without a constant index.
16172 SDValue Idx = I->getOperand(1);
16173 if (!isa<ConstantSDNode>(Idx))
16174   return SDValue();
16176 SDValue ExtractedFromVec = I->getOperand(0);
16177 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16178 if (M == VecInMap.end()) {
16179 VT = ExtractedFromVec.getValueType();
16180 // Quit if not 128/256-bit vector.
16181 if (!VT.is128BitVector() && !VT.is256BitVector())
16182   return SDValue();
16183 // Quit if not the same type.
16184 if (VecInMap.begin() != VecInMap.end() &&
16185 VT != VecInMap.begin()->first.getValueType())
16186   return SDValue();
16187 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16188 VecIns.push_back(ExtractedFromVec);
16190 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16191 }
16193 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16194 "Not extracted from 128-/256-bit vector.");
16196 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16198 for (DenseMap<SDValue, unsigned>::const_iterator
16199 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16200 // Quit if not all elements are used.
16201 if (I->second != FullMask)
16202   return SDValue();
16203 }
16205 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16207 // Cast all vectors into TestVT for PTEST.
16208 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16209 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16211 // If more than one full vector is evaluated, OR them first before PTEST.
16212 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16213 // Each iteration will OR 2 nodes and append the result until there is only
16214 // 1 node left, i.e. the final OR'd value of all vectors.
16215 SDValue LHS = VecIns[Slot];
16216 SDValue RHS = VecIns[Slot + 1];
16217 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16218 }
16220 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16223 /// \brief return true if \c Op has a use that doesn't just read flags.
16224 static bool hasNonFlagsUse(SDValue Op) {
16225 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16227 SDNode *User = *UI;
16228 unsigned UOpNo = UI.getOperandNo();
16229 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16230 // Look past the truncate.
16231 UOpNo = User->use_begin().getOperandNo();
16232 User = *User->use_begin();
16235 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16236 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16237   return true;
16238 }
16240 return false;
16241 }
16242 // Emit KTEST instruction for bit vectors on AVX-512
16243 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16244 const X86Subtarget &Subtarget) {
16245 if (Op.getOpcode() == ISD::BITCAST) {
16246 auto hasKTEST = [&](MVT VT) {
16247 unsigned SizeInBits = VT.getSizeInBits();
16248 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16249 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16250 };
16251 SDValue Op0 = Op.getOperand(0);
16252 MVT Op0VT = Op0.getValueType().getSimpleVT();
16253 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16254     hasKTEST(Op0VT))
16255   return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16256 }
16257 return SDValue();
16258 }
16260 /// Emit nodes that will be selected as "test Op0,Op0", or something
16261 /// equivalent.
16262 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16263 SelectionDAG &DAG) const {
16264 if (Op.getValueType() == MVT::i1) {
16265 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16266 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16267 DAG.getConstant(0, dl, MVT::i8));
16269 // CF and OF aren't always set the way we want. Determine which
16270 // of these we need.
16271 bool NeedCF = false;
16272 bool NeedOF = false;
16273 switch (X86CC) {
16274 default: break;
16275 case X86::COND_A: case X86::COND_AE:
16276 case X86::COND_B: case X86::COND_BE:
16277   NeedCF = true;
16278   break;
16279 case X86::COND_G: case X86::COND_GE:
16280 case X86::COND_L: case X86::COND_LE:
16281 case X86::COND_O: case X86::COND_NO: {
16282 // Check if we really need to set the Overflow flag.
16283 // If the NoSignedWrap flag is present, it is not actually needed.
16285 switch (Op->getOpcode()) {
16286 case ISD::ADD:
16287 case ISD::SUB:
16288 case ISD::MUL:
16289 case ISD::SHL:
16290   if (Op.getNode()->getFlags().hasNoSignedWrap())
16291     break;
16292   LLVM_FALLTHROUGH;
16293 default:
16294   NeedOF = true;
16295   break;
16296 }
16297 break;
16298 }
16299 }
16300 // See if we can use the EFLAGS value from the operand instead of
16301 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16302 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16303 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16304 // Emit KTEST for bit vectors
16305 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16306   return Node;
16307 // Emit a CMP with 0, which is the TEST pattern.
16308 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16309 DAG.getConstant(0, dl, Op.getValueType()));
16311 unsigned Opcode = 0;
16312 unsigned NumOperands = 0;
16314 // Truncate operations may prevent the merge of the SETCC instruction
16315 // and the arithmetic instruction before it. Attempt to truncate the operands
16316 // of the arithmetic instruction and use a reduced bit-width instruction.
16317 bool NeedTruncation = false;
16318 SDValue ArithOp = Op;
16319 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16320 SDValue Arith = Op->getOperand(0);
16321 // Both the trunc and the arithmetic op need to have one user each.
16322 if (Arith->hasOneUse())
16323 switch (Arith.getOpcode()) {
16324 default: break;
16325 case ISD::ADD:
16326 case ISD::SUB:
16327 case ISD::AND:
16328 case ISD::OR:
16329 case ISD::XOR:
16330   NeedTruncation = true;
16331   ArithOp = Arith;
16332   break;
16333 }
16334 }
16336 // Sometimes flags can be set either with an AND or with an SRL/SHL
16337 // instruction. SRL/SHL variant should be preferred for masks longer than this
16338 // number of bits.
16339 const int ShiftToAndMaxMaskWidth = 32;
16340 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
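// Example (illustrative): (srl x, 28) == 0 can become
// (and x, 0xF0000000) == 0, selectable as TEST $0xF0000000, since that mask
// fits a signed 32-bit immediate; conversely, an AND with a mask such as
// 0xFFFF000000000000 (not encodable) is better tested via a single shift.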
16342 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16343 // which may be the result of a CAST. We use the variable 'Op', which is the
16344 // non-casted variable when we check for possible users.
16345 switch (ArithOp.getOpcode()) {
16346 case ISD::ADD:
16347 // Due to an isel shortcoming, be conservative if this add is likely to be
16348 // selected as part of a load-modify-store instruction. When the root node
16349 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16350 // uses of other nodes in the match, such as the ADD in this case. This
16351 // leads to the ADD being left around and reselected, with the result being
16352 // two adds in the output. Alas, even if none of our users are stores, that
16353 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16354 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16355 // climbing the DAG back to the root, and it doesn't seem to be worth the
16356 // effort.
16357 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16358 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16359 if (UI->getOpcode() != ISD::CopyToReg &&
16360 UI->getOpcode() != ISD::SETCC &&
16361 UI->getOpcode() != ISD::STORE)
16362   goto default_case;
16364 if (ConstantSDNode *C =
16365 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16366 // An add of one will be selected as an INC.
16367 if (C->isOne() && !Subtarget.slowIncDec()) {
16368 Opcode = X86ISD::INC;
16369 NumOperands = 1;
16370 break;
16371 }
16373 // An add of negative one (subtract of one) will be selected as a DEC.
16374 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16375 Opcode = X86ISD::DEC;
16376 NumOperands = 1;
16377 break;
16378 }
16379 }
16381 // Otherwise use a regular EFLAGS-setting add.
16382 Opcode = X86ISD::ADD;
16383 NumOperands = 2;
16384 break;
16385 case ISD::SHL:
16386 case ISD::SRL:
16387 // If we have a constant logical shift that's only used in a comparison
16388 // against zero turn it into an equivalent AND. This allows turning it into
16389 // a TEST instruction later.
16390 if (ZeroCheck && Op->hasOneUse() &&
16391 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16392 EVT VT = Op.getValueType();
16393 unsigned BitWidth = VT.getSizeInBits();
16394 unsigned ShAmt = Op->getConstantOperandVal(1);
16395 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16396   break;
16397 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16398 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16399 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16400 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16401   break;
16402 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16403 DAG.getConstant(Mask, dl, VT));
16404 }
16405 break;
16407 case ISD::AND:
16408 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16409 // because a TEST instruction will be better. However, AND should be
16410 // preferred if the instruction can be combined into ANDN.
16411 if (!hasNonFlagsUse(Op)) {
16412 SDValue Op0 = ArithOp->getOperand(0);
16413 SDValue Op1 = ArithOp->getOperand(1);
16414 EVT VT = ArithOp.getValueType();
16415 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16416 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16417 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16419 // If we cannot select an ANDN instruction, check if we can replace
16420 // AND+IMM64 with a shift before giving up. This is possible for masks
16421 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16422 if (!isProperAndn) {
16426 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16427 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16428 if (!CN)
16429   break;
16431 const APInt &Mask = CN->getAPIntValue();
16432 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16433 break; // Prefer TEST instruction.
16435 unsigned BitWidth = Mask.getBitWidth();
16436 unsigned LeadingOnes = Mask.countLeadingOnes();
16437 unsigned TrailingZeros = Mask.countTrailingZeros();
16439 if (LeadingOnes + TrailingZeros == BitWidth) {
16440 assert(TrailingZeros < VT.getSizeInBits() &&
16441 "Shift amount should be less than the type width");
16442 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16443 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16444 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16445 break;
16446 }
16448 unsigned LeadingZeros = Mask.countLeadingZeros();
16449 unsigned TrailingOnes = Mask.countTrailingOnes();
16451 if (LeadingZeros + TrailingOnes == BitWidth) {
16452 assert(LeadingZeros < VT.getSizeInBits() &&
16453 "Shift amount should be less than the type width");
16454 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16455 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16456 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16457 break;
16458 }
16459 }
16460 }
16462 LLVM_FALLTHROUGH;
16463 case ISD::SUB:
16464 case ISD::OR:
16465 case ISD::XOR:
16467 // Due to the ISEL shortcoming noted above, be conservative if this op is
16468 // likely to be selected as part of a load-modify-store instruction.
16469 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16470 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16471 if (UI->getOpcode() == ISD::STORE)
16472   goto default_case;
16474 // Otherwise use a regular EFLAGS-setting instruction.
16475 switch (ArithOp.getOpcode()) {
16476 default: llvm_unreachable("unexpected operator!");
16477 case ISD::SUB: Opcode = X86ISD::SUB; break;
16478 case ISD::XOR: Opcode = X86ISD::XOR; break;
16479 case ISD::AND: Opcode = X86ISD::AND; break;
16480 case ISD::OR: {
16481 if (!NeedTruncation && ZeroCheck) {
16482 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16483   return EFLAGS;
16484 }
16485 Opcode = X86ISD::OR;
16486 break;
16487 }
16489 case X86ISD::ADD:
16490 case X86ISD::SUB:
16491 case X86ISD::INC:
16492 case X86ISD::DEC:
16493 case X86ISD::OR:
16494 case X86ISD::XOR:
16495 case X86ISD::AND:
16499   return SDValue(Op.getNode(), 1);
16500 default:
16501 default_case:
16502   break;
16503 }
16505 // If we found that truncation is beneficial, perform the truncation and
16506 // update its users.
16507 if (NeedTruncation) {
16508 EVT VT = Op.getValueType();
16509 SDValue WideVal = Op->getOperand(0);
16510 EVT WideVT = WideVal.getValueType();
16511 unsigned ConvertedOp = 0;
16512 // Use a target machine opcode to prevent further DAGCombine
16513 // optimizations that may separate the arithmetic operations
16514 // from the setcc node.
16515 switch (WideVal.getOpcode()) {
16517 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16518 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16519 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16520 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16521 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16522 }
16524 if (ConvertedOp) {
16525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16526 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16527 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16528 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16529 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16530 }
16531 }
16532 }
16534 if (Opcode == 0) {
16535   // Emit KTEST for bit vectors
16536   if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16537     return Node;
16539   // Emit a CMP with 0, which is the TEST pattern.
16540   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16541                      DAG.getConstant(0, dl, Op.getValueType()));
16542 }
16543 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16544 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16546 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16547 DAG.ReplaceAllUsesWith(Op, New);
16548 return SDValue(New.getNode(), 1);
16551 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16552 /// equivalent.
16553 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16554 const SDLoc &dl, SelectionDAG &DAG) const {
16555 if (isNullConstant(Op1))
16556 return EmitTest(Op0, X86CC, dl, DAG);
16558 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16559 "Unexpected comparison operation for MVT::i1 operands");
16561 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16562 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16563 // Only promote the compare up to i32 if it is a 16-bit operation
16564 // with an immediate. 16-bit immediates are to be avoided.
16565 if ((Op0.getValueType() == MVT::i16 &&
16566 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16567 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16568 !Subtarget.isAtom()) {
16569 unsigned ExtendOp =
16570 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16571 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16572 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16574 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16575 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16576 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16577                           Op0, Op1);
16578 return SDValue(Sub.getNode(), 1);
16579 }
16580 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16583 /// Convert a comparison if required by the subtarget.
16584 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16585 SelectionDAG &DAG) const {
16586 // If the subtarget does not support the FUCOMI instruction, floating-point
16587 // comparisons have to be converted.
16588 if (Subtarget.hasCMov() ||
16589 Cmp.getOpcode() != X86ISD::CMP ||
16590 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16591 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16592   return Cmp;
16594 // The instruction selector will select an FUCOM instruction instead of
16595 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16596 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16597 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
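// Roughly the following instruction sequence (illustrative):
//   fucom  st(1)      ; compare, result goes into FPSW
//   fnstsw %ax        ; copy FPSW into AX
//   sahf              ; load AH (FPSW bits 8-15) into EFLAGS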
16599 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16600 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16601 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16602 DAG.getConstant(8, dl, MVT::i8));
16603 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16605 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16606 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16607 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16610 /// Check if replacement of SQRT with RSQRT should be disabled.
16611 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16612 EVT VT = Op.getValueType();
16614 // We never want to use both SQRT and RSQRT instructions for the same input.
16615 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16616   return false;
16618 if (VT.isVector())
16619   return Subtarget.hasFastVectorFSQRT();
16620 return Subtarget.hasFastScalarFSQRT();
16621 }
16623 /// The minimum architected relative accuracy is 2^-12. We need one
16624 /// Newton-Raphson step to have a good float result (24 bits of precision).
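/// One step refines an estimate x0 for 1/sqrt(a) as (illustrative):
///   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
/// which roughly doubles the number of accurate bits (~12 -> ~24).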
16625 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16626 SelectionDAG &DAG, int Enabled,
16627 int &RefinementSteps,
16628 bool &UseOneConstNR,
16629 bool Reciprocal) const {
16630 EVT VT = Op.getValueType();
16632 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16633 // TODO: Add support for AVX512 (v16f32).
16634 // It is likely not profitable to do this for f64 because a double-precision
16635 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16636 // instructions: convert to single, rsqrtss, convert back to double, refine
16637 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16638 // along with FMA, this could be a throughput win.
16639 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16640 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16641 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16642 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16643 RefinementSteps = 1;
16645 UseOneConstNR = false;
16646 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16647 }
16649 return SDValue();
16650 }
16651 /// The minimum architected relative accuracy is 2^-12. We need one
16652 /// Newton-Raphson step to have a good float result (24 bits of precision).
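/// One step refines an estimate x0 for 1/a as (illustrative):
///   x1 = x0 * (2.0 - a * x0)
/// which roughly doubles the number of accurate bits (~12 -> ~24).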
16653 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16654                                             int Enabled,
16655                                             int &RefinementSteps) const {
16656 EVT VT = Op.getValueType();
16658 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16659 // TODO: Add support for AVX512 (v16f32).
16660 // It is likely not profitable to do this for f64 because a double-precision
16661 // reciprocal estimate with refinement on x86 prior to FMA requires
16662 // 15 instructions: convert to single, rcpss, convert back to double, refine
16663 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16664 // along with FMA, this could be a throughput win.
16666 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16667 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16668 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16669 // Enable estimate codegen with 1 refinement step for vector division.
16670 // Scalar division estimates are disabled because they break too much
16671 // real-world code. These defaults are intended to match GCC behavior.
16672 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16673   return SDValue();
16675 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16676 RefinementSteps = 1;
16678 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16679 }
16681 return SDValue();
16682 }
16683 /// If we have at least two divisions that use the same divisor, convert to
16684 /// multiplication by a reciprocal. This may need to be adjusted for a given
16685 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16686 /// This is because we still need one division to calculate the reciprocal and
16687 /// then we need two multiplies by that reciprocal as replacements for the
16688 /// original divisions.
16689 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16690   return 2;
16691 }
16693 /// Helper for creating a X86ISD::SETCC node.
16694 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16695 SelectionDAG &DAG) {
16696 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16697 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16700 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16701 /// according to equal/not-equal condition code \p CC.
16702 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16703 const SDLoc &dl, SelectionDAG &DAG) {
16704 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16705 // instruction. Since the shift amount is in-range-or-undefined, we know
16706 // that doing a bittest on the i32 value is ok. We extend to i32 because
16707 // the encoding for the i16 version is larger than the i32 version.
16708 // Also promote i16 to i32 for performance / code size reasons.
16709 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16710 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16712 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16713 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16714 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16715 // known to be zero.
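// Example (illustrative): BT32 reads the bit at (BitNo mod 32) and BT64 at
// (BitNo mod 64); the two agree exactly when bit 5 (value 32) of BitNo is
// zero, e.g. for any BitNo in [0, 31], and the 32-bit form also saves a
// REX prefix.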
16716 if (Src.getValueType() == MVT::i64 &&
16717 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16718 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16720 // If the operand types disagree, extend the shift amount to match. Since
16721 // BT ignores high bits (like shifts) we can use anyextend.
16722 if (Src.getValueType() != BitNo.getValueType())
16723 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
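// BT sets CF to the value of the tested bit, so "bit == 0" (SETEQ) maps to
// COND_AE (CF = 0) and "bit != 0" (SETNE) to COND_B (CF = 1).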
16725 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16726 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16727 return getSETCC(Cond, BT, dl, DAG);
16728 }
16730 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16731 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16732 const SDLoc &dl, SelectionDAG &DAG) {
16733 SDValue Op0 = And.getOperand(0);
16734 SDValue Op1 = And.getOperand(1);
16735 if (Op0.getOpcode() == ISD::TRUNCATE)
16736 Op0 = Op0.getOperand(0);
16737 if (Op1.getOpcode() == ISD::TRUNCATE)
16738 Op1 = Op1.getOperand(0);
16740 SDValue LHS, RHS;
16741 if (Op1.getOpcode() == ISD::SHL)
16742 std::swap(Op0, Op1);
16743 if (Op0.getOpcode() == ISD::SHL) {
16744 if (isOneConstant(Op0.getOperand(0))) {
16745 // If we looked past a truncate, check that it's only truncating away
16746 // sign bits.
16747 unsigned BitWidth = Op0.getValueSizeInBits();
16748 unsigned AndBitWidth = And.getValueSizeInBits();
16749 if (BitWidth > AndBitWidth) {
16750 KnownBits Known;
16751 DAG.computeKnownBits(Op0, Known);
16752 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
16753   return SDValue();
16754 }
16755 LHS = Op1;
16756 RHS = Op0.getOperand(1);
16758 } else if (Op1.getOpcode() == ISD::Constant) {
16759 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16760 uint64_t AndRHSVal = AndRHS->getZExtValue();
16761 SDValue AndLHS = Op0;
16763 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16764 LHS = AndLHS.getOperand(0);
16765 RHS = AndLHS.getOperand(1);
16766 }
16768 // Use BT if the immediate can't be encoded in a TEST instruction.
16769 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16770 LHS = AndLHS;
16771 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16772 }
16773 }
16775 if (LHS.getNode())
16776   return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16778 return SDValue();
16779 }
16781 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16782 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16783 const SDLoc &dl, SelectionDAG &DAG) {
16785 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16786 "Expected TRUNCATE to i1 node");
16788 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16789   return SDValue();
16791 SDValue ShiftRight = Op.getOperand(0);
16792 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16793                            CC, dl, DAG);
16794 }
16796 /// Result of 'and' or 'trunc to i1' is compared against zero.
16797 /// Change to a BT node if possible.
16798 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16799 const SDLoc &dl, SelectionDAG &DAG) const {
16800 if (Op.getOpcode() == ISD::AND)
16801 return LowerAndToBT(Op, CC, dl, DAG);
16802 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16803 return LowerTruncateToBT(Op, CC, dl, DAG);
16805 return SDValue();
16806 }
16807 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
16809 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16810                               SDValue &Op1) {
16811 unsigned SSECC;
16812 bool Swap = false;
16814 // SSE Condition code mapping:
16815 //  0 - EQ
16816 //  1 - LT
16817 //  2 - LE
16818 //  3 - UNORD
16819 //  4 - NEQ
16820 //  5 - NLT
16821 //  6 - NLE
16822 //  7 - ORD
16823 switch (SetCCOpcode) {
16824 default: llvm_unreachable("Unexpected SETCC condition");
16825 case ISD::SETOEQ:
16826 case ISD::SETEQ:  SSECC = 0; break;
16827 case ISD::SETOGT:
16828 case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
16829 case ISD::SETLT:
16830 case ISD::SETOLT: SSECC = 1; break;
16831 case ISD::SETOGE:
16832 case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
16833 case ISD::SETLE:
16834 case ISD::SETOLE: SSECC = 2; break;
16835 case ISD::SETUO:  SSECC = 3; break;
16836 case ISD::SETUNE:
16837 case ISD::SETNE:  SSECC = 4; break;
16838 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16839 case ISD::SETUGE: SSECC = 5; break;
16840 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16841 case ISD::SETUGT: SSECC = 6; break;
16842 case ISD::SETO: SSECC = 7; break;
16843 case ISD::SETUEQ: SSECC = 8; break;
16844 case ISD::SETONE: SSECC = 12; break;
16845 }
16846 if (Swap)
16847   std::swap(Op0, Op1);
16849 return SSECC;
16850 }
16852 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16853 /// concatenate the result back.
16854 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16855 MVT VT = Op.getSimpleValueType();
16857 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16858 "Unsupported value type for operation");
16860 unsigned NumElems = VT.getVectorNumElements();
16861 SDLoc dl(Op);
16862 SDValue CC = Op.getOperand(2);
16864 // Extract the LHS vectors
16865 SDValue LHS = Op.getOperand(0);
16866 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16867 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16869 // Extract the RHS vectors
16870 SDValue RHS = Op.getOperand(1);
16871 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16872 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16874 // Issue the operation on the smaller types and concatenate the result back
16875 MVT EltVT = VT.getVectorElementType();
16876 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16877 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16878 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16879 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16880 }
16882 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16883 SDValue Op0 = Op.getOperand(0);
16884 SDValue Op1 = Op.getOperand(1);
16885 SDValue CC = Op.getOperand(2);
16886 MVT VT = Op.getSimpleValueType();
16887 SDLoc dl(Op);
16889 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16890 "Unexpected type for boolean compare operation");
16891 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16892 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16893 DAG.getConstant(-1, dl, VT));
16894 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16895 DAG.getConstant(-1, dl, VT));
16896 switch (SetCCOpcode) {
16897 default: llvm_unreachable("Unexpected SETCC condition");
16898 case ISD::SETEQ:
16899 // (x == y) -> ~(x ^ y)
16900 return DAG.getNode(ISD::XOR, dl, VT,
16901 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16902 DAG.getConstant(-1, dl, VT));
16903 case ISD::SETNE:
16904 // (x != y) -> (x ^ y)
16905 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16906 case ISD::SETUGT:
16907 case ISD::SETGT:
16908 // (x > y) -> (x & ~y)
16909 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16910 case ISD::SETULT:
16911 case ISD::SETLT:
16912 // (x < y) -> (~x & y)
16913 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16914 case ISD::SETULE:
16915 case ISD::SETLE:
16916 // (x <= y) -> (~x | y)
16917 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16918 case ISD::SETUGE:
16919 case ISD::SETGE:
16920 // (x >= y) -> (x | ~y)
16921 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16922 }
16923 }
16925 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16927 SDValue Op0 = Op.getOperand(0);
16928 SDValue Op1 = Op.getOperand(1);
16929 SDValue CC = Op.getOperand(2);
16930 MVT VT = Op.getSimpleValueType();
16931 SDLoc dl(Op);
16933 assert(VT.getVectorElementType() == MVT::i1 &&
16934 "Cannot set masked compare for this operation");
16936 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16937 unsigned Opc = 0;
16938 bool Unsigned = false;
16939 bool Swap = false;
16940 unsigned SSECC = 0;
16941 switch (SetCCOpcode) {
16942 default: llvm_unreachable("Unexpected SETCC condition");
16943 case ISD::SETNE: SSECC = 4; break;
16944 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16945 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16946 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16947 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16948 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16949 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16950 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16951 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16952 case ISD::SETLE: SSECC = 2; break;
16953 }
16955 if (Swap)
16956 std::swap(Op0, Op1);
16957 if (Opc)
16958 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16959 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16960 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16961 DAG.getConstant(SSECC, dl, MVT::i8));
16962 }
16964 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16965 /// operand \p Op1. If non-trivial (for example because it's not constant)
16966 /// return an empty value.
16967 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16968 SelectionDAG &DAG) {
16969 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16970 if (!BV)
16971 return SDValue();
16973 MVT VT = Op1.getSimpleValueType();
16974 MVT EVT = VT.getVectorElementType();
16975 unsigned n = VT.getVectorNumElements();
16976 SmallVector<SDValue, 8> ULTOp1;
16978 for (unsigned i = 0; i < n; ++i) {
16979 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16980 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16981 return SDValue();
16983 // Avoid underflow.
16984 APInt Val = Elt->getAPIntValue();
16985 if (Val == 0)
16986 return SDValue();
16988 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16989 }
16991 return DAG.getBuildVector(VT, dl, ULTOp1);
16992 }
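// Illustrative example (not from the original comments): a v4i32 compare
// X <u <4, 4, 4, 4> becomes X <=u <3, 3, 3, 3>, which the PSUBUS-based
// lowering below handles without swapping the operands.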
16994 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16995 SelectionDAG &DAG) {
16996 SDValue Op0 = Op.getOperand(0);
16997 SDValue Op1 = Op.getOperand(1);
16998 SDValue CC = Op.getOperand(2);
16999 MVT VT = Op.getSimpleValueType();
17000 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17001 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17002 SDLoc dl(Op);
17004 if (isFP) {
17005 #ifndef NDEBUG
17006 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17007 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17008 #endif
17010 unsigned Opc;
17011 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17012 assert(VT.getVectorNumElements() <= 16);
17013 Opc = X86ISD::CMPM;
17014 } else {
17015 Opc = X86ISD::CMPP;
17016 // The SSE/AVX packed FP comparison nodes are defined with a
17017 // floating-point vector result that matches the operand type. This allows
17018 // them to work with an SSE1 target (integer vector types are not legal).
17019 VT = Op0.getSimpleValueType();
17020 }
17022 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17023 // emit two comparisons and a logic op to tie them together.
17024 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17025 // available.
17026 SDValue Cmp;
17027 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
17028 if (SSECC == 8) {
17029 // LLVM predicate is SETUEQ or SETONE.
17030 unsigned CC0, CC1;
17031 unsigned CombineOpc;
17032 if (SetCCOpcode == ISD::SETUEQ) {
17033 CC0 = 3; // UNORD
17034 CC1 = 0; // EQ
17035 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17036 static_cast<unsigned>(ISD::OR);
17037 } else {
17038 assert(SetCCOpcode == ISD::SETONE);
17039 CC0 = 7; // ORD
17040 CC1 = 4; // NEQ
17041 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17042 static_cast<unsigned>(ISD::AND);
17043 }
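// Worked example (illustrative): for v4f32, SETUEQ expands to
//   (CMPPS x, y, 3 /*UNORD*/) FOR (CMPPS x, y, 0 /*EQ*/)
// and SETONE expands to
//   (CMPPS x, y, 7 /*ORD*/) FAND (CMPPS x, y, 4 /*NEQ*/).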
17045 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17046 DAG.getConstant(CC0, dl, MVT::i8));
17047 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17048 DAG.getConstant(CC1, dl, MVT::i8));
17049 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17050 } else {
17051 // Handle all other FP comparisons here.
17052 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17053 DAG.getConstant(SSECC, dl, MVT::i8));
17054 }
17056 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17057 // result type of SETCC. The bitcast is expected to be optimized away
17058 // during combining/isel.
17059 if (Opc == X86ISD::CMPP)
17060 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17062 return Cmp;
17063 }
17065 MVT VTOp0 = Op0.getSimpleValueType();
17066 assert(VTOp0 == Op1.getSimpleValueType() &&
17067 "Expected operands with same type!");
17068 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17069 "Invalid number of packed elements for source and destination!");
17071 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17072 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17073 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17074 // legalizer firstly checks if the first operand in input to the setcc has
17075 // a legal type. If so, then it promotes the return type to that same type.
17076 // Otherwise, the return type is promoted to the 'next legal type' which,
17077 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17079 // We reach this code only if the following two conditions are met:
17080 // 1. Both return type and operand type have been promoted to wider types
17081 // by the type legalizer.
17082 // 2. The original operand type has been promoted to a 256-bit vector.
17084 // Note that condition 2. only applies for AVX targets.
17085 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
17086 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17087 }
17089 // The non-AVX512 code below works under the assumption that source and
17090 // destination types are the same.
17091 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17092 "Value types for source and destination must be the same!");
17094 // Break 256-bit integer vector compare into smaller ones.
17095 if (VT.is256BitVector() && !Subtarget.hasInt256())
17096 return Lower256IntVSETCC(Op, DAG);
17098 // Operands are boolean (vectors of i1)
17099 MVT OpVT = Op1.getSimpleValueType();
17100 if (OpVT.getVectorElementType() == MVT::i1)
17101 return LowerBoolVSETCC_AVX512(Op, DAG);
17103 // The result is boolean, but operands are int/float
17104 if (VT.getVectorElementType() == MVT::i1) {
17105 // In the AVX-512 architecture setcc returns a mask with i1 elements,
17106 // but there is no compare instruction for i8 and i16 elements in KNL.
17107 // In this case use an SSE compare.
17108 bool UseAVX512Inst =
17109 (OpVT.is512BitVector() ||
17110 OpVT.getScalarSizeInBits() >= 32 ||
17111 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17113 if (UseAVX512Inst)
17114 return LowerIntVSETCC_AVX512(Op, DAG);
17116 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17117 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17118 }
17120 // Lower using XOP integer comparisons.
17121 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17122 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17123 // Translate compare code to XOP PCOM compare mode.
17124 unsigned CmpMode = 0;
17125 switch (SetCCOpcode) {
17126 default: llvm_unreachable("Unexpected SETCC condition");
17127 case ISD::SETULT:
17128 case ISD::SETLT: CmpMode = 0x00; break;
17129 case ISD::SETULE:
17130 case ISD::SETLE: CmpMode = 0x01; break;
17131 case ISD::SETUGT:
17132 case ISD::SETGT: CmpMode = 0x02; break;
17133 case ISD::SETUGE:
17134 case ISD::SETGE: CmpMode = 0x03; break;
17135 case ISD::SETEQ: CmpMode = 0x04; break;
17136 case ISD::SETNE: CmpMode = 0x05; break;
17137 }
17139 // Are we comparing unsigned or signed integers?
17140 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
17141 ? X86ISD::VPCOMU : X86ISD::VPCOM;
17143 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17144 DAG.getConstant(CmpMode, dl, MVT::i8));
17145 }
17147 // We are handling one of the integer comparisons here. Since SSE only has
17148 // GT and EQ comparisons for integer, swapping operands and multiple
17149 // operations may be required for some comparisons.
17150 unsigned Opc;
17151 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
17152 bool Subus = false;
17154 switch (SetCCOpcode) {
17155 default: llvm_unreachable("Unexpected SETCC condition");
17156 case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH;
17157 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
17158 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17159 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
17160 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17161 case ISD::SETLE: Opc = X86ISD::PCMPGT;
17162 Invert = true; break;
17163 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17164 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
17165 FlipSigns = true; break;
17166 case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
17167 case ISD::SETULE: Opc = X86ISD::PCMPGT;
17168 FlipSigns = true; Invert = true; break;
17169 }
17171 // Special case: Use min/max operations for SETULE/SETUGE
17172 MVT VET = VT.getVectorElementType();
17173 bool hasMinMax =
17174 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
17175 || (Subtarget.hasSSE2() && (VET == MVT::i8));
17177 if (hasMinMax) {
17178 switch (SetCCOpcode) {
17179 default: break;
17180 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17181 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17182 }
17184 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
17185 }
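// Identities used here (illustrative note): x u<= y iff umin(x, y) == x,
// and x u>= y iff umax(x, y) == x, so the final compare below becomes
// PCMPEQ(Op0, UMIN/UMAX(Op0, Op1)).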
17187 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17188 if (!MinMax && hasSubus) {
17189 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17190 // Op0 u<= Op1:
17191 // t = psubus Op0, Op1
17192 // pcmpeq t, <0..0>
17193 switch (SetCCOpcode) {
17194 default: break;
17195 case ISD::SETULT: {
17196 // If the comparison is against a constant we can turn this into a
17197 // setule. With psubus, setule does not require a swap. This is
17198 // beneficial because the constant in the register is no longer
17199 // destructed as the destination so it can be hoisted out of a loop.
17200 // Only do this pre-AVX since vpcmp* is no longer destructive.
17201 if (Subtarget.hasAVX())
17202 break;
17203 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17204 Op1 = ULEOp1;
17205 Subus = true; Invert = false; Swap = false;
17206 }
17207 break;
17208 }
17209 // Psubus is better than flip-sign because it requires no inversion.
17210 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17211 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17212 }
17214 if (Subus) {
17215 Opc = X86ISD::SUBUS;
17216 FlipSigns = false;
17217 Invert = false;
17218 }
17219 }
17220 if (Swap)
17221 std::swap(Op0, Op1);
17223 // Check that the operation in question is available (most are plain SSE2,
17224 // but PCMPGTQ and PCMPEQQ have different requirements).
17225 if (VT == MVT::v2i64) {
17226 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17227 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17229 // First cast everything to the right type.
17230 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17231 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17233 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17234 // bits of the inputs before performing those operations. The lower
17235 // compare is always unsigned.
17236 SDValue SB;
17237 if (FlipSigns) {
17238 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17239 } else {
17240 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17241 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17242 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17243 }
17244 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17245 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17247 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17248 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17249 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17251 // Create masks for only the low parts/high parts of the 64 bit integers.
17252 static const int MaskHi[] = { 1, 1, 3, 3 };
17253 static const int MaskLo[] = { 0, 0, 2, 2 };
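// Illustrative: with dwords [lo0, hi0, lo1, hi1], MaskHi {1,1,3,3}
// broadcasts each lane's high dword and MaskLo {0,0,2,2} its low dword,
// letting the 64-bit lane results be assembled from 32-bit compares.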
17254 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17255 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17256 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17258 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17259 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17261 if (Invert)
17262 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17264 return DAG.getBitcast(VT, Result);
17265 }
17267 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17268 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17269 // pcmpeqd + pshufd + pand.
17270 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17272 // First cast everything to the right type.
17273 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17274 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17276 // Do the compare.
17277 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17279 // Make sure the lower and upper halves are both all-ones.
17280 static const int Mask[] = { 1, 0, 3, 2 };
17281 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17282 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17284 if (Invert)
17285 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17287 return DAG.getBitcast(VT, Result);
17288 }
17289 }
17291 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17292 // bits of the inputs before performing those operations.
17293 if (FlipSigns) {
17294 MVT EltVT = VT.getVectorElementType();
17295 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17296 VT);
17297 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17298 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17299 }
17301 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17303 // If the logical-not of the result is required, perform that now.
17304 if (Invert)
17305 Result = DAG.getNOT(dl, Result, VT);
17307 if (MinMax)
17308 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17310 if (Subus)
17311 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17312 getZeroVector(VT, Subtarget, DAG, dl));
17314 return Result;
17315 }
17317 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17319 MVT VT = Op.getSimpleValueType();
17321 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17322 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1)) &&
17323 "SetCC type must be 8-bit or 1-bit integer");
17324 SDValue Op0 = Op.getOperand(0);
17325 SDValue Op1 = Op.getOperand(1);
17326 SDLoc dl(Op);
17327 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17329 // Optimize to BT if possible.
17330 // Lower (X & (1 << N)) == 0 to BT(X, N).
17331 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17332 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17333 // Lower (trunc (X >> N) to i1) to BT(X, N).
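// Illustrative example (not from the original comments): for
// (setcc (and (srl X, 9), 1), 0, ne) this produces
// (X86ISD::SETCC COND_B, (X86ISD::BT X, 9)) -- BT copies bit 9 of X
// into CF and the SETCC reads it back.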
17334 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17335 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17336 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17337 if (VT == MVT::i1)
17338 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17339 return NewSetCC;
17340 }
17341 }
17343 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17344 // these.
17345 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17346 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17348 // If the input is a setcc, then reuse the input setcc or use a new one with
17349 // the inverted condition.
17350 if (Op0.getOpcode() == X86ISD::SETCC) {
17351 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17352 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17354 if (!Invert)
17355 return Op0;
17356 CCode = X86::GetOppositeBranchCondition(CCode);
17357 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17358 if (VT == MVT::i1)
17359 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17360 return SetCC;
17361 }
17362 }
17363 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17364 if (isOneConstant(Op1)) {
17365 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17366 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17367 }
17368 if (!isNullConstant(Op1)) {
17369 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17370 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17371 }
17372 }
17374 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17375 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17376 if (X86CC == X86::COND_INVALID)
17377 return SDValue();
17379 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17380 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17381 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17382 if (VT == MVT::i1)
17383 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17384 return SetCC;
17385 }
17387 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17388 SDValue LHS = Op.getOperand(0);
17389 SDValue RHS = Op.getOperand(1);
17390 SDValue Carry = Op.getOperand(2);
17391 SDValue Cond = Op.getOperand(3);
17392 SDLoc DL(Op);
17394 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17395 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17397 // Recreate the carry if needed.
17398 EVT CarryVT = Carry.getValueType();
17399 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17400 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17401 Carry, DAG.getConstant(NegOne, DL, CarryVT));
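// Illustrative note: adding all-ones (-1) to the carry value overflows
// exactly when the carry is nonzero, so the ADD's EFLAGS result has CF
// set iff the incoming carry was 1; the SBB below consumes that CF.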
17403 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17404 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17405 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17406 if (Op.getSimpleValueType() == MVT::i1)
17407 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17408 return SetCC;
17409 }
17411 /// Return true if opcode is a X86 logical comparison.
17412 static bool isX86LogicalCmp(SDValue Op) {
17413 unsigned Opc = Op.getOpcode();
17414 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17415 Opc == X86ISD::SAHF)
17416 return true;
17417 if (Op.getResNo() == 1 &&
17418 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17419 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17420 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17421 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17422 return true;
17424 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17425 return true;
17427 return false;
17428 }
17430 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17431 if (V.getOpcode() != ISD::TRUNCATE)
17432 return false;
17434 SDValue VOp0 = V.getOperand(0);
17435 unsigned InBits = VOp0.getValueSizeInBits();
17436 unsigned Bits = V.getValueSizeInBits();
17437 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17438 }
17440 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17441 bool AddTest = true;
17442 SDValue Cond = Op.getOperand(0);
17443 SDValue Op1 = Op.getOperand(1);
17444 SDValue Op2 = Op.getOperand(2);
17445 SDLoc DL(Op);
17446 MVT VT = Op1.getSimpleValueType();
17447 SDValue CC;
17449 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17450 // are available or VBLENDV if AVX is available.
17451 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17452 if (Cond.getOpcode() == ISD::SETCC &&
17453 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17454 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17455 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17456 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17457 int SSECC = translateX86FSETCC(
17458 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17460 if (SSECC != 8) {
17461 if (Subtarget.hasAVX512()) {
17462 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17463 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17464 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17465 DL, VT, Cmp, Op1, Op2);
17466 }
17468 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17469 DAG.getConstant(SSECC, DL, MVT::i8));
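// Illustrative note: Cmp is a full-width mask (all-ones when the
// predicate holds, all-zeros otherwise), so the FAND/FANDN/FOR sequence
// below computes (Cmp & Op1) | (~Cmp & Op2), a branchless scalar select.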
17471 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17472 // of 3 logic instructions for size savings and potentially speed.
17473 // Unfortunately, there is no scalar form of VBLENDV.
17475 // If either operand is a constant, don't try this. We can expect to
17476 // optimize away at least one of the logic instructions later in that
17477 // case, so that sequence would be faster than a variable blend.
17479 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17480 // uses XMM0 as the selection register. That may need just as many
17481 // instructions as the AND/ANDN/OR sequence due to register moves, so
17482 // don't bother.
17484 if (Subtarget.hasAVX() &&
17485 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17487 // Convert to vectors, do a VSELECT, and convert back to scalar.
17488 // All of the conversions should be optimized away.
17490 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17491 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17492 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17493 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17495 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17496 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17498 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17500 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17501 VSel, DAG.getIntPtrConstant(0, DL));
17502 }
17503 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17504 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17505 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17506 }
17507 }
17509 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17510 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17511 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17512 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17513 }
17515 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17516 SDValue Op1Scalar;
17517 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17518 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17519 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17520 Op1Scalar = Op1.getOperand(0);
17521 SDValue Op2Scalar;
17522 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17523 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17524 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17525 Op2Scalar = Op2.getOperand(0);
17526 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17527 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17528 Op1Scalar, Op2Scalar);
17529 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17530 return DAG.getBitcast(VT, newSelect);
17531 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17532 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17533 DAG.getIntPtrConstant(0, DL));
17534 }
17535 }
17537 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17538 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17539 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17540 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17541 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17542 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17543 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17544 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17545 }
17547 if (Cond.getOpcode() == ISD::SETCC) {
17548 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17549 Cond = NewCond;
17550 // If the condition was updated, it's possible that the operands of the
17551 // select were also updated (for example, EmitTest has a RAUW). Refresh
17552 // the local references to the select operands in case they got stale.
17553 Op1 = Op.getOperand(1);
17554 Op2 = Op.getOperand(2);
17555 }
17556 }
17558 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17559 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17560 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17561 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17562 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17563 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17564 if (Cond.getOpcode() == X86ISD::SETCC &&
17565 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17566 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17567 SDValue Cmp = Cond.getOperand(1);
17568 unsigned CondCode =
17569 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17571 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17572 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17573 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17575 SDValue CmpOp0 = Cmp.getOperand(0);
17576 // Apply further optimizations for special cases
17577 // (select (x != 0), -1, 0) -> neg & sbb
17578 // (select (x == 0), 0, -1) -> neg & sbb
17579 if (isNullConstant(Y) &&
17580 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17581 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17582 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17583 DAG.getConstant(0, DL,
17584 CmpOp0.getValueType()),
17585 CmpOp0);
17586 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17587 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17588 SDValue(Neg.getNode(), 1));
17589 return Res;
17590 }
17592 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17593 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17594 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17596 SDValue Res = // Res = 0 or -1.
17597 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17598 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17600 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17601 Res = DAG.getNOT(DL, Res, Res.getValueType());
17603 if (!isNullConstant(Op2))
17604 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17605 return Res;
17606 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17607 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17608 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17609 SDValue CmpOp0 = Cmp.getOperand(0);
17610 SDValue Src1, Src2;
17611 // true if Op2 is XOR or OR operator and one of its operands
17612 // is equal to Op1
17613 // ( a , a op b) || ( b , a op b)
17614 auto isOrXorPattern = [&]() {
17615 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17616 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17617 Src1 =
17618 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17619 Src2 = Op1;
17620 return true;
17621 }
17622 return false;
17623 };
17625 if (isOrXorPattern()) {
17626 SDValue Neg;
17627 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17628 // We need a mask of all zeros or all ones with the same size as the
17629 // other operands.
17630 if (CmpSz > VT.getSizeInBits())
17631 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17632 else if (CmpSz < VT.getSizeInBits())
17633 Neg = DAG.getNode(ISD::AND, DL, VT,
17634 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17635 DAG.getConstant(1, DL, VT));
17636 else
17637 Neg = CmpOp0;
17638 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17639 Neg); // -(and (x, 0x1))
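// Illustrative: Neg is 0 or 1 here, so Mask is all-zeros or all-ones;
// (Mask & z) op y then yields y or (z op y), matching the select arms.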
17640 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17641 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17642 }
17643 }
17644 }
17646 // Look past (and (setcc_carry (cmp ...)), 1).
17647 if (Cond.getOpcode() == ISD::AND &&
17648 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17649 isOneConstant(Cond.getOperand(1)))
17650 Cond = Cond.getOperand(0);
17652 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17653 // setting operand in place of the X86ISD::SETCC.
17654 unsigned CondOpcode = Cond.getOpcode();
17655 if (CondOpcode == X86ISD::SETCC ||
17656 CondOpcode == X86ISD::SETCC_CARRY) {
17657 CC = Cond.getOperand(0);
17659 SDValue Cmp = Cond.getOperand(1);
17660 unsigned Opc = Cmp.getOpcode();
17661 MVT VT = Op.getSimpleValueType();
17663 bool IllegalFPCMov = false;
17664 if (VT.isFloatingPoint() && !VT.isVector() &&
17665 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17666 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17668 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17669 Opc == X86ISD::BT) { // FIXME
17670 Cond = Cmp;
17671 AddTest = false;
17672 }
17673 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17674 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17675 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17676 Cond.getOperand(0).getValueType() != MVT::i8)) {
17677 SDValue LHS = Cond.getOperand(0);
17678 SDValue RHS = Cond.getOperand(1);
17679 unsigned X86Opcode;
17680 unsigned X86Cond;
17681 SDVTList VTs;
17682 switch (CondOpcode) {
17683 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17684 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17685 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17686 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17687 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17688 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17689 default: llvm_unreachable("unexpected overflowing operator");
17690 }
17691 if (CondOpcode == ISD::UMULO)
17692 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17693 MVT::i32);
17694 else
17695 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17697 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17699 if (CondOpcode == ISD::UMULO)
17700 Cond = X86Op.getValue(2);
17701 else
17702 Cond = X86Op.getValue(1);
17704 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17705 AddTest = false;
17706 }
17708 if (AddTest) {
17709 // Look past the truncate if the high bits are known zero.
17710 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17711 Cond = Cond.getOperand(0);
17713 // We know the result of AND is compared against zero. Try to match
17714 // it to BT.
17715 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17716 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17717 CC = NewSetCC.getOperand(0);
17718 Cond = NewSetCC.getOperand(1);
17719 AddTest = false;
17720 }
17721 }
17722 }
17724 if (AddTest) {
17725 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17726 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17727 }
17729 // a < b ? -1 : 0 -> RES = ~setcc_carry
17730 // a < b ? 0 : -1 -> RES = setcc_carry
17731 // a >= b ? -1 : 0 -> RES = setcc_carry
17732 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17733 if (Cond.getOpcode() == X86ISD::SUB) {
17734 Cond = ConvertCmpIfNecessary(Cond, DAG);
17735 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17737 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17738 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17739 (isNullConstant(Op1) || isNullConstant(Op2))) {
17740 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17741 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17742 Cond);
17743 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17744 return DAG.getNOT(DL, Res, Res.getValueType());
17745 return Res;
17746 }
17747 }
17749 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17750 // widen the cmov and push the truncate through. This avoids introducing a new
17751 // branch during isel and doesn't add any extensions.
17752 if (Op.getValueType() == MVT::i8 &&
17753 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17754 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17755 if (T1.getValueType() == T2.getValueType() &&
17756 // Blacklist CopyFromReg to avoid partial register stalls.
17757 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17758 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17759 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17760 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17761 }
17762 }
17764 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17765 // condition is true.
17766 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17767 SDValue Ops[] = { Op2, Op1, CC, Cond };
17768 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17769 }
17771 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17772 const X86Subtarget &Subtarget,
17773 SelectionDAG &DAG) {
17774 MVT VT = Op->getSimpleValueType(0);
17775 SDValue In = Op->getOperand(0);
17776 MVT InVT = In.getSimpleValueType();
17777 MVT VTElt = VT.getVectorElementType();
17778 MVT InVTElt = InVT.getVectorElementType();
17779 SDLoc dl(Op);
17781 // SKX processor
17782 if ((InVTElt == MVT::i1) &&
17783 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
17785 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
17787 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17789 unsigned NumElts = VT.getVectorNumElements();
17791 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17792 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17793 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17794 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
17795 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
17796 }
17798 if (InVTElt != MVT::i1)
17799 return SDValue();
17801 MVT ExtVT = VT;
17802 if (!VT.is512BitVector() && !Subtarget.hasVLX())
17803 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17805 SDValue V;
17806 if (Subtarget.hasDQI()) {
17807 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
17808 assert(!VT.is512BitVector() && "Unexpected vector type");
17809 } else {
17810 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
17811 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
17812 V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
17813 if (ExtVT == VT)
17814 return V;
17815 }
17817 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17818 }
17820 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17821 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17822 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17823 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17824 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17825 const X86Subtarget &Subtarget,
17826 SelectionDAG &DAG) {
17827 SDValue In = Op->getOperand(0);
17828 MVT VT = Op->getSimpleValueType(0);
17829 MVT InVT = In.getSimpleValueType();
17830 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17832 MVT SVT = VT.getVectorElementType();
17833 MVT InSVT = InVT.getVectorElementType();
17834 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17836 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17837 return SDValue();
17838 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17839 return SDValue();
17840 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17841 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17842 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17843 return SDValue();
17845 SDLoc dl(Op);
17847 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17848 // For 512-bit vectors, we need 128-bits or 256-bits.
17849 if (VT.getSizeInBits() > 128) {
17850 // Input needs to be at least the same number of elements as output, and
17851 // at least 128-bits.
17852 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17853 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17854 InVT = In.getSimpleValueType();
17855 }
17856 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17857 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17859 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
17860 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
17861 // need to be handled here for 256/512-bit results.
17862 if (Subtarget.hasInt256()) {
17863 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
17864 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17865 X86ISD::VSEXT : X86ISD::VZEXT;
17866 return DAG.getNode(ExtOpc, dl, VT, In);
17867 }
17869 // We should only get here for sign extend.
17870 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17871 "Unexpected opcode!");
17873 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17874 SDValue Curr = In;
17875 MVT CurrVT = InVT;
17877 // As SRAI is only available on i16/i32 types, we expand only up to i32
17878 // and handle i64 separately.
17879 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17880 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17881 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17882 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17883 Curr = DAG.getBitcast(CurrVT, Curr);
17884 }
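// Illustrative trace (pre-SSE4.1, v16i8 -> v8i16): UNPCKL with undef
// puts each source byte into the high byte of a 16-bit lane, and the
// VSRAI below shifts it back down, replicating the sign bit.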
17886 SDValue SignExt = Curr;
17887 if (CurrVT != InVT) {
17888 unsigned SignExtShift =
17889 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17890 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17891 DAG.getConstant(SignExtShift, dl, MVT::i8));
17892 }
17894 if (CurrVT == VT)
17895 return SignExt;
17897 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17898 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17899 DAG.getConstant(31, dl, MVT::i8));
17900 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17901 return DAG.getBitcast(VT, Ext);
17902 }
17904 return SDValue();
17905 }
17907 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17908 SelectionDAG &DAG) {
17909 MVT VT = Op->getSimpleValueType(0);
17910 SDValue In = Op->getOperand(0);
17911 MVT InVT = In.getSimpleValueType();
17912 SDLoc dl(Op);
17914 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17915 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17917 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17918 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17919 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17920 return SDValue();
17922 if (Subtarget.hasInt256())
17923 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17925 // Optimize vectors in AVX mode
17926 // Sign extend v8i16 to v8i32 and
17927 // v4i32 to v4i64.
17929 // Divide input vector into two parts
17930 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17931 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17932 // concat the vectors to original VT
17934 unsigned NumElems = InVT.getVectorNumElements();
17935 SDValue Undef = DAG.getUNDEF(InVT);
17937 SmallVector<int,8> ShufMask1(NumElems, -1);
17938 for (unsigned i = 0; i != NumElems/2; ++i)
17939 ShufMask1[i] = i;
17941 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17943 SmallVector<int,8> ShufMask2(NumElems, -1);
17944 for (unsigned i = 0; i != NumElems/2; ++i)
17945 ShufMask2[i] = i + NumElems/2;
17947 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17949 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17950 VT.getVectorNumElements() / 2);
17952 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
17953 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
17955 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17956 }
17958 // Lower truncating store. We need a special lowering to vXi1 vectors
17959 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17960 SelectionDAG &DAG) {
17961 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17962 SDLoc dl(St);
17963 EVT MemVT = St->getMemoryVT();
17964 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17965 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17966 "Expected truncstore of i1 vector");
17968 SDValue Op = St->getValue();
17969 MVT OpVT = Op.getValueType().getSimpleVT();
17970 unsigned NumElts = OpVT.getVectorNumElements();
17971 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17972 NumElts == 16) {
17973 // Truncate and store - everything is legal
17974 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17975 if (MemVT.getSizeInBits() < 8)
17976 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17977 DAG.getUNDEF(MVT::v8i1), Op,
17978 DAG.getIntPtrConstant(0, dl));
17979 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17980 St->getMemOperand());
17981 }
17983 // A subset, assume that we have only AVX-512F
17984 if (NumElts <= 8) {
17985 if (NumElts < 8) {
17986 // Extend to 8-elts vector
17987 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17988 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17989 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17990 }
17991 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17992 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17993 St->getMemOperand());
17994 }
17995 // v32i8
17996 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17997 // Divide the vector into 2 parts and store each part separately
17998 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17999 DAG.getIntPtrConstant(0, dl));
18000 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18001 SDValue BasePtr = St->getBasePtr();
18002 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18003 St->getMemOperand());
18004 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18005 DAG.getIntPtrConstant(16, dl));
18006 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18008 SDValue BasePtrHi =
18009 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18010 DAG.getConstant(2, dl, BasePtr.getValueType()));
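// Illustrative note: each v16i1 half is stored as a 16-bit mask, so the
// high half lives at byte offset 2 from the base pointer.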
18012 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18013 BasePtrHi, St->getMemOperand());
18014 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18015 }
18017 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18018 const X86Subtarget &Subtarget,
18019 SelectionDAG &DAG) {
18021 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18022 SDLoc dl(Ld);
18023 EVT MemVT = Ld->getMemoryVT();
18024 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18025 "Expected i1 vector load");
18026 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18027 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18028 MVT VT = Op.getValueType().getSimpleVT();
18029 unsigned NumElts = VT.getVectorNumElements();
18031 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18032 (Subtarget.hasDQI() && NumElts < 16) ||
18033 NumElts == 16) {
18034 // Load and extend - everything is legal
18035 if (NumElts < 8) {
18036 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18037 Ld->getBasePtr(),
18038 Ld->getMemOperand());
18039 // Replace chain users with the new chain.
18040 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18041 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18042 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18043 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18045 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18046 DAG.getIntPtrConstant(0, dl));
18047 }
18048 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18049 Ld->getBasePtr(),
18050 Ld->getMemOperand());
18051 // Replace chain users with the new chain.
18052 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18053 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18055 // Finally, do a normal sign-extend to the desired register.
18056 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18057 }
18059 if (NumElts <= 8) {
18060 // A subset, assume that we have only AVX-512F
18061 unsigned NumBitsToLoad = 8;
18062 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18063 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18064 Ld->getBasePtr(),
18065 Ld->getMemOperand());
18066 // Replace chain users with the new chain.
18067 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18068 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18070 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18071 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18073 if (NumElts == 8)
18074 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18076 // Handle the remaining v4i1 and v2i1 cases here.
18078 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18079 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18080 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18081 DAG.getIntPtrConstant(0, dl));
18082 }
18084 assert(VT == MVT::v32i8 && "Unexpected extload type");
18086 SmallVector<SDValue, 2> Chains;
18088 SDValue BasePtr = Ld->getBasePtr();
18089 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18090 Ld->getBasePtr(),
18091 Ld->getMemOperand());
18092 Chains.push_back(LoadLo.getValue(1));
18094 SDValue BasePtrHi =
18095 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18096 DAG.getConstant(2, dl, BasePtr.getValueType()));
18098 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18099 BasePtrHi,
18100 Ld->getMemOperand());
18101 Chains.push_back(LoadHi.getValue(1));
18102 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18103 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18105 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18106 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18107 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18108 }
18110 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18111 // may emit an illegal shuffle but the expansion is still better than scalar
18112 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18113 // we'll emit a shuffle and an arithmetic shift.
18114 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18115 // TODO: It is possible to support ZExt by zeroing the undef values during
18116 // the shuffle phase or after the shuffle.
18117 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18118 SelectionDAG &DAG) {
18119 MVT RegVT = Op.getSimpleValueType();
18120 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18121 assert(RegVT.isInteger() &&
18122 "We only custom lower integer vector sext loads.");
18124 // Nothing useful we can do without SSE2 shuffles.
18125 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18127 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18129 EVT MemVT = Ld->getMemoryVT();
18130 if (MemVT.getScalarType() == MVT::i1)
18131 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18133 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18134 unsigned RegSz = RegVT.getSizeInBits();
18136 ISD::LoadExtType Ext = Ld->getExtensionType();
18138 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18139 && "Only anyext and sext are currently implemented.");
18140 assert(MemVT != RegVT && "Cannot extend to the same type");
18141 assert(MemVT.isVector() && "Must load a vector from memory");
18143 unsigned NumElems = RegVT.getVectorNumElements();
18144 unsigned MemSz = MemVT.getSizeInBits();
18145 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18147 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18148 // The only way in which we have a legal 256-bit vector result but not the
18149 // integer 256-bit operations needed to directly lower a sextload is if we
18150 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18151 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18152 // correctly legalized. We do this late to allow the canonical form of
18153 // sextload to persist throughout the rest of the DAG combiner -- it wants
18154 // to fold together any extensions it can, and so will fuse a sign_extend
18155 // of an sextload into a sextload targeting a wider value.
18156 SDValue Load;
18157 if (MemSz == 128) {
18158 // Just switch this to a normal load.
18159 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18160 "it must be a legal 128-bit vector "
18162 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18163 Ld->getPointerInfo(), Ld->getAlignment(),
18164 Ld->getMemOperand()->getFlags());
18165 } else {
18166 assert(MemSz < 128 &&
18167 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18168 // Do an sext load to a 128-bit vector type. We want to use the same
18169 // number of elements, but elements half as wide. This will end up being
18170 // recursively lowered by this routine, but will succeed as we definitely
18171 // have all the necessary features if we're using AVX1.
18172 EVT HalfEltVT =
18173 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18174 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18175 Load =
18176 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18177 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18178 Ld->getMemOperand()->getFlags());
18179 }
18181 // Replace chain users with the new chain.
18182 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18183 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18185 // Finally, do a normal sign-extend to the desired register.
18186 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18187 }
18189 // All sizes must be a power of two.
18190 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18191 "Non-power-of-two elements are not custom lowered!");
18193 // Attempt to load the original value using scalar loads.
18194 // Find the largest scalar type that divides the total loaded size.
18195 MVT SclrLoadTy = MVT::i8;
18196 for (MVT Tp : MVT::integer_valuetypes()) {
18197 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18198 SclrLoadTy = Tp;
18199 }
18200 }
18202 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18203 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18204 (64 <= MemSz))
18205 SclrLoadTy = MVT::f64;
18207 // Calculate the number of scalar loads that we need to perform
18208 // in order to load our vector from memory.
18209 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18211 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18212 "Can only lower sext loads with a single scalar load!");
18214 unsigned loadRegZize = RegSz;
18215 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18216 loadRegZize = 128;
18218 // Represent our vector as a sequence of elements which are the
18219 // largest scalar that we can load.
18220 EVT LoadUnitVecVT = EVT::getVectorVT(
18221 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18223 // Represent the data using the same element type that is stored in
18224 // memory. In practice, we "widen" MemVT.
18225 EVT WideVecVT =
18226 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18227 loadRegZize / MemVT.getScalarSizeInBits());
18229 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18230 "Invalid vector type");
18232 // We can't shuffle using an illegal type.
18233 assert(TLI.isTypeLegal(WideVecVT) &&
18234 "We only lower types that form legal widened vector types");
18236 SmallVector<SDValue, 8> Chains;
18237 SDValue Ptr = Ld->getBasePtr();
18238 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18239 TLI.getPointerTy(DAG.getDataLayout()));
18240 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18242 for (unsigned i = 0; i < NumLoads; ++i) {
18243 // Perform a single load.
18244 SDValue ScalarLoad =
18245 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18246 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18247 Chains.push_back(ScalarLoad.getValue(1));
18248 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18249 // another round of DAGCombining.
18250 if (i == 0)
18251 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18252 else
18253 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18254 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18256 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18257 }
18259 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18261 // Bitcast the loaded value to a vector of the original element type, in
18262 // the size of the target vector type.
18263 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18264 unsigned SizeRatio = RegSz / MemSz;
18266 if (Ext == ISD::SEXTLOAD) {
18267 // If we have SSE4.1, we can directly emit a VSEXT node.
18268 if (Subtarget.hasSSE41()) {
18269 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18270 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18271 return Sext;
18272 }
18274 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18275 // lanes.
18276 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18277 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18279 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18280 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18281 return Shuff;
18282 }
18284 // Redistribute the loaded elements into the different locations.
18285 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18286 for (unsigned i = 0; i != NumElems; ++i)
18287 ShuffleVec[i * SizeRatio] = i;
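// Illustrative: with SizeRatio == 2 the mask is {0, -1, 1, -1, ...},
// placing each loaded element in the low half of a widened lane and
// leaving the rest undef -- effectively an any-extend of the vector.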
18289 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18290 DAG.getUNDEF(WideVecVT), ShuffleVec);
18292 // Bitcast to the requested type.
18293 Shuff = DAG.getBitcast(RegVT, Shuff);
18294 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18295 return Shuff;
18296 }
18298 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18299 /// each of which has no other use apart from the AND / OR.
18300 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18301 Opc = Op.getOpcode();
18302 if (Opc != ISD::OR && Opc != ISD::AND)
18303 return false;
18304 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18305 Op.getOperand(0).hasOneUse() &&
18306 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18307 Op.getOperand(1).hasOneUse());
18308 }
18310 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18311 /// SETCC node has a single use.
18312 static bool isXor1OfSetCC(SDValue Op) {
18313 if (Op.getOpcode() != ISD::XOR)
18314 return false;
18315 if (isOneConstant(Op.getOperand(1)))
18316 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18317 Op.getOperand(0).hasOneUse();
18318 return false;
18319 }
18321 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18322 bool addTest = true;
18323 SDValue Chain = Op.getOperand(0);
18324 SDValue Cond = Op.getOperand(1);
18325 SDValue Dest = Op.getOperand(2);
18326 SDLoc dl(Op);
18327 SDValue CC;
18328 bool Inverted = false;
18330 if (Cond.getOpcode() == ISD::SETCC) {
18331 // Check for setcc([su]{add,sub,mul}o == 0).
18332 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18333 isNullConstant(Cond.getOperand(1)) &&
18334 Cond.getOperand(0).getResNo() == 1 &&
18335 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18336 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18337 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18338 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18339 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18340 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18341 Inverted = true;
18342 Cond = Cond.getOperand(0);
18343 } else {
18344 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18345 Cond = NewCond;
18346 }
18347 }
18348 #if 0
18349 // FIXME: LowerXALUO doesn't handle these!!
18350 else if (Cond.getOpcode() == X86ISD::ADD ||
18351 Cond.getOpcode() == X86ISD::SUB ||
18352 Cond.getOpcode() == X86ISD::SMUL ||
18353 Cond.getOpcode() == X86ISD::UMUL)
18354 Cond = LowerXALUO(Cond, DAG);
18355 #endif
18357 // Look past (and (setcc_carry (cmp ...)), 1).
18358 if (Cond.getOpcode() == ISD::AND &&
18359 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18360 isOneConstant(Cond.getOperand(1)))
18361 Cond = Cond.getOperand(0);
18363 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18364 // setting operand in place of the X86ISD::SETCC.
18365 unsigned CondOpcode = Cond.getOpcode();
18366 if (CondOpcode == X86ISD::SETCC ||
18367 CondOpcode == X86ISD::SETCC_CARRY) {
18368 CC = Cond.getOperand(0);
18370 SDValue Cmp = Cond.getOperand(1);
18371 unsigned Opc = Cmp.getOpcode();
18372 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18373 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18374 Cond = Cmp;
18375 addTest = false;
18376 } else {
18377 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18378 default: break;
18379 case X86::COND_O:
18380 case X86::COND_B:
18381 // These can only come from an arithmetic instruction with overflow,
18382 // e.g. SADDO, UADDO.
18383 Cond = Cond.getOperand(1);
18384 addTest = false;
18385 break;
18386 }
18387 }
18388 }
18389 CondOpcode = Cond.getOpcode();
18390 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18391 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18392 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18393 Cond.getOperand(0).getValueType() != MVT::i8)) {
18394 SDValue LHS = Cond.getOperand(0);
18395 SDValue RHS = Cond.getOperand(1);
18396 unsigned X86Opcode;
18397 unsigned X86Cond;
18398 SDVTList VTs;
18399 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18400 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18401 // X86ISD::INC).
18402 switch (CondOpcode) {
18403 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18404 case ISD::SADDO:
18405 if (isOneConstant(RHS)) {
18406 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18407 break;
18408 }
18409 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18410 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18411 case ISD::SSUBO:
18412 if (isOneConstant(RHS)) {
18413 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18414 break;
18415 }
18416 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18417 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18418 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18419 default: llvm_unreachable("unexpected overflowing operator");
18422 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18423 if (CondOpcode == ISD::UMULO)
18424 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18427 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18429 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18431 if (CondOpcode == ISD::UMULO)
18432 Cond = X86Op.getValue(2);
18434 Cond = X86Op.getValue(1);
18436 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User);
            (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
      // It should be transformed during dag combiner, except when the
      // condition is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, dl, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
      // For FCMP_OEQ, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
      // For FCMP_UNE, we can emit
      // two branches instead of an explicit OR instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_UNE.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User);
          (void)NewBR;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
          Dest = FalseBB;
        }
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result is compared against zero. Try to match it to BT.
    if (Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
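// For example, with no fall-through edge, "br (fcmp oeq %a, %b), %T, %F" is
// lowered by the SETOEQ path above into one compare feeding two conditional
// branches:
//   ucomiss %b, %a
//   jne %F
//   jp  %F
//   jmp %T
// instead of materializing sete/setnp, ANDing them, and testing the result.
// (Illustrative sketch; the exact instructions depend on the subtarget.)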
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
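// For example, allocating 16K in one go on such targets emits a probe call
// (with the size in RAX/EAX) before the stack pointer is adjusted, so each
// new 4K page is touched in order. The probe symbol varies by environment
// (e.g. _alloca on cygwin/mingw32, ___chkstk_ms on mingw-w64, __chkstk for
// MSVC x64); this is a sketch of the intent, not the exact call sequence.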
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool SplitStack = MF.shouldSplitStack();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack;
  SDLoc dl(Op);

  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                    " not tell us which reg is the stack pointer!");

    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
    Chain = SP.getValue(1);
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
    unsigned StackAlign = TFI.getStackAlignment();
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
    if (Align > StackAlign)
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
                           DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
  } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
      const Function *F = MF.getFunction();
      for (const auto &A : F->args()) {
        if (A.hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");
      }
    }

    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned SPReg = RegInfo->getStackRegister();
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
}
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  if (!Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }
  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (points to parameters coming in memory).
  //   reg_save_area
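  // For reference, this is the SysV x86-64 ABI's C-level view of the same
  // object (field offsets assume LP64):
  //   typedef struct {
  //     unsigned int gp_offset;    // byte offset 0
  //     unsigned int fp_offset;    // byte offset 4
  //     void *overflow_arg_area;   // byte offset 8
  //     void *reg_save_area;       // byte offset 16
  //   } __va_list_tag;
  // The four stores below initialize these fields in order.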
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV));
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
  Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV, 4));
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  Store = DAG.getStore(
      Op.getOperand(0), DL, RSFIN, FIN,
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  MachineFunction &MF = DAG.getMachineFunction();
  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }

  // Insert VAARG_64 node into the DAG.
  // VAARG_64 returns two values: Variable Argument Address, Chain.
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                       DAG.getConstant(ArgMode, dl, MVT::i8),
                       DAG.getConstant(Align, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, InstOps, MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it.
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
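// For example, "va_arg(ap, double)" reaches here with ArgSize = 8 and
// ArgMode = 2. The VAARG_64 pseudo is later expanded with a custom inserter
// (EmitVAARG64WithCustomInserter in this file) into, roughly: compare
// fp_offset against the end of the FP register-save area, then either take
// reg_save_area + fp_offset and bump fp_offset, or take overflow_arg_area and
// bump that pointer. (A sketch of the control flow, not the exact MI
// sequence.)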
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  if (Subtarget.isCallingConvWin64(
          DAG.getMachineFunction().getFunction()->getCallingConv()))
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
                       false, false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
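// The 24-byte copy length above is sizeof(__va_list_tag) on x86-64:
// 4 (gp_offset) + 4 (fp_offset) + 8 (overflow_arg_area) + 8 (reg_save_area).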
/// Handle vector element shifts where the shift amount is a constant.
/// Takes the immediate version of the shift as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  MVT ElementType = VT.getVectorElementType();

  // Bitcast the source vector to the output type; this is mainly necessary
  // for vXi8/vXi64 shifts.
  if (VT != SrcOp.getSimpleValueType())
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width.
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");

  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs.
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
    SmallVector<SDValue, 8> Elts;
    unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }

  return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
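// Example: a VSRAI of v4i32 by 35 is clamped above to a shift by 31, since
// arithmetic shifts saturate toward the sign bit, while VSHLI/VSRLI by an
// out-of-range amount fold directly to an all-zeros vector.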
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes the immediate version of the shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT SVT = ShAmt.getSimpleValueType();
  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

  // Catch shift-by-constant.
  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
                                      CShAmt->getZExtValue(), DAG);

  // Change opcode to non-immediate version.
  switch (Opc) {
  default: llvm_unreachable("Unknown target vector shift node");
  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
  }

  // Need to build a vector containing the shift amount.
  // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
  // +=================+============+=======================================+
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
  // +=================+============+=======================================+
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
  // | i32             | Yes        | zero-extend in-reg                    |
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
  // +=================+============+=======================================+
  if (SVT == MVT::i64)
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
    ShAmt = ShAmt.getOperand(0);
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else if (Subtarget.hasSSE41() &&
             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
                                     DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
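// Design note: the scalar shift count is widened to a full XMM operand because
// the hardware variable-shift forms read the entire low 64 bits of the count
// register, so the bits above the scalar amount must be known zero -- hence
// the in-reg zero-extends on SSE4.1 and the explicit {ShAmt, 0, undef, undef}
// build_vector otherwise.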
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {
  if (isAllOnesConstant(Mask))
    return DAG.getTargetConstant(1, dl, MaskVT);
  if (X86::isZeroNode(Mask))
    return DAG.getTargetConstant(0, dl, MaskVT);

  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
    // Mask should be extended.
    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }

  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
    if (MaskVT == MVT::v64i1) {
      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In 32-bit mode a bitcast of i64 is illegal; extend/split it.
      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(0, dl, MVT::i32));
      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(1, dl, MVT::i32));

      Lo = DAG.getBitcast(MVT::v32i1, Lo);
      Hi = DAG.getBitcast(MVT::v32i1, Hi);

      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
    } else {
      // MaskVT requires fewer than 64 bits. Truncate the mask (this should
      // always succeed), and bitcast.
      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
      return DAG.getBitcast(MaskVT,
                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
    }
  }

  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                   Mask.getSimpleValueType().getSizeInBits());
  // In case MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
  // are extracted by EXTRACT_SUBVECTOR.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                     DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
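// Example: a v4i1 MaskVT whose mask arrives as i8 takes the last path above --
// the i8 is bitcast to v8i1 and the low 4 lanes are taken with
// EXTRACT_SUBVECTOR.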
/// \brief Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  switch (Op.getOpcode()) {
  default: break;
  case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
  case X86ISD::VFPCLASS:
  case X86ISD::VFPCLASSS:
    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
  case X86ISD::VTRUNC:
  case X86ISD::VTRUNCS:
  case X86ISD::VTRUNCUS:
  case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal" for the
    // destination type. For example, vpmovqb requires only AVX512F, but a
    // vselect that operates on byte elements requires AVX512BW.
    OpcodeSelect = X86ISD::SELECT;
    break;
  }

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
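// Example: a masked arithmetic intrinsic becomes
//   (vselect Mask, (op Src1, Src2), PreservedSrc)
// whereas compare-to-mask nodes already produce a k-mask result, so masking
// reduces to an AND with the caller's mask.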
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as MVT::i8 and should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
  if (Op.getOpcode() == X86ISD::FSETCCM ||
      Op.getOpcode() == X86ISD::FSETCCM_RND)
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
  if (Op.getOpcode() == X86ISD::VFPCLASSS)
    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
  if (!Fn->hasPersonalityFn())
    report_fatal_error(
        "querying registration node size for function without personality");
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
  // WinEHStatePass for the full struct definition.
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
  case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
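/// Worked example (illustrative numbers only): with RegNodeSize = 24 (an SEH
/// personality) and a recorded ParentFrameOffset of -16,
///   ParentFP = (EntryEBP - 24) - (-16) = EntryEBP - 8.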
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  MCSymbol *OffsetSym =
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget =
      static_cast<const X86Subtarget &>(DAG.getSubtarget());
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Helper to detect if the operand is the CUR_DIRECTION rounding mode.
  auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
    case INTR_TYPE_1OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
    case INTR_TYPE_2OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    case INTR_TYPE_3OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3));
    case INTR_TYPE_4OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
    case INTR_TYPE_1OP_MASK_RM: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue RoundingMode;
      // We always add the rounding mode to the node.
      // If the rounding mode is not specified, we add the
      // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_1OP_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      // - the RM opcode is specified and
      // - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue passThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, VT, Src1, Src2, Rnd),
                                      Mask, passThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, passThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src0 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // There are 2 kinds of intrinsics in this group:
      // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
      // (2) With rounding mode and sae - 7 operands.
      if (Op.getNumOperands() == 6) {
        SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                                Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
      SDValue RoundingMode = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                              RoundingMode, Sae),
                                  Mask, Src0, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK:
    case INTR_TYPE_2OP_IMM8_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      // TODO: Intrinsics should have fast-math-flags to propagate.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode.
      // First, we check if the intrinsic has a rounding mode (6 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3, Sae),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Imm = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP_MASK : {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK: {
      MVT VT = Op.getSimpleValueType();
      // Src2 is the PassThru.
      SDValue Src1 = Op.getOperand(1);
      // PassThru needs to be the same type as the destination in order
      // to pattern match correctly.
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src2;

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASK3:
    case FMA_OP_MASKZ:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      SDValue Rnd = Op.getOperand(5);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set the PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      // - the RM opcode is specified and
      // - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask.
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MVT::i1),
                                                 Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
      //           (v2i1 (and (PCMPEQM %a, %b),
      //                      (extract_subvector
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have a non-default rounding
        // mode (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // Default rounding mode.
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MVT::i1),
                                             Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 1 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 0 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
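    // Example for the COMI case above: _mm_comieq_sd(a, b) emits COMISD and
    // then ANDs SETE with SETNP, because an unordered compare also sets ZF
    // and only PF distinguishes the unordered case from true equality.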
    case COMI_RM: { // Comparison intrinsics with Sae.
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
    case COMPRESS_EXPAND_IN_REG: {
      SDValue Mask = Op.getOperand(3);
      SDValue DataToCompress = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      if (isAllOnesConstant(Mask)) // Return the data as is.
        return Op.getOperand(1);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CONVERT_MASK_TO_VEC: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                    Mask.getSimpleValueType().getSizeInBits());
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case KUNPCK: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      // Arguments should be swapped.
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMMS:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMM:
    case FIXUPIMM_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Imm = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS) ?
                         Src1 : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics: with and without a
      // rounding mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case CONVERT_TO_MASK: {
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CvtMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case BRCST_SUBVEC_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue Passthru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      EVT resVT = Passthru.getValueType();
      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
                                   DAG.getUNDEF(resVT), Src,
                                   DAG.getIntPtrConstant(0, dl));
      SDValue immVal;
      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
        immVal = DAG.getConstant(0x44, dl, MVT::i8);
      else
        immVal = DAG.getConstant(0, dl, MVT::i8);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              subVec, subVec, immVal),
                                  Mask, Passthru, Subtarget, DAG);
    }
    case BRCST32x2_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);

      assert((VT.getScalarType() == MVT::i32 ||
              VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // Bitcast Src to packed 64-bit elements.
      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
      Src = DAG.getBitcast(BitcastVT, Src);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
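  // Example for the lowering above: int _mm_testz_si128(a, b) becomes
  // PTEST a, b followed by SETE, and the i8 flag result is zero-extended to
  // the i32 the intrinsic returns.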
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
      (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_avx512_knot_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kandn_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    // Invert LHS for the not.
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
                      DAG.getConstant(1, dl, MVT::v16i1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kxnor_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    // Invert the result for the not.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
                      DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }
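  // Note on the three k-register ops above: they are expressed as generic
  // v16i1 logic (with an XOR against all-ones for the NOT) and bitcast back
  // to i16 so ordinary DAG combines apply; instruction selection is then
  // expected to pick the k-instructions (e.g. KNOTW, KANDNW, KXNORW). This is
  // a reading of the code above, not a guarantee about the final MI sequence.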
19879 case Intrinsic::x86_sse42_pcmpistria128:
19880 case Intrinsic::x86_sse42_pcmpestria128:
19881 case Intrinsic::x86_sse42_pcmpistric128:
19882 case Intrinsic::x86_sse42_pcmpestric128:
19883 case Intrinsic::x86_sse42_pcmpistrio128:
19884 case Intrinsic::x86_sse42_pcmpestrio128:
19885 case Intrinsic::x86_sse42_pcmpistris128:
19886 case Intrinsic::x86_sse42_pcmpestris128:
19887 case Intrinsic::x86_sse42_pcmpistriz128:
19888 case Intrinsic::x86_sse42_pcmpestriz128: {
19890 X86::CondCode X86CC;
19892 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19893 case Intrinsic::x86_sse42_pcmpistria128:
19894 Opcode = X86ISD::PCMPISTRI;
19895 X86CC = X86::COND_A;
19897 case Intrinsic::x86_sse42_pcmpestria128:
19898 Opcode = X86ISD::PCMPESTRI;
19899 X86CC = X86::COND_A;
19901 case Intrinsic::x86_sse42_pcmpistric128:
19902 Opcode = X86ISD::PCMPISTRI;
19903 X86CC = X86::COND_B;
19905 case Intrinsic::x86_sse42_pcmpestric128:
19906 Opcode = X86ISD::PCMPESTRI;
19907 X86CC = X86::COND_B;
19909 case Intrinsic::x86_sse42_pcmpistrio128:
19910 Opcode = X86ISD::PCMPISTRI;
19911 X86CC = X86::COND_O;
19913 case Intrinsic::x86_sse42_pcmpestrio128:
19914 Opcode = X86ISD::PCMPESTRI;
19915 X86CC = X86::COND_O;
19917 case Intrinsic::x86_sse42_pcmpistris128:
19918 Opcode = X86ISD::PCMPISTRI;
19919 X86CC = X86::COND_S;
19921 case Intrinsic::x86_sse42_pcmpestris128:
19922 Opcode = X86ISD::PCMPESTRI;
19923 X86CC = X86::COND_S;
19925 case Intrinsic::x86_sse42_pcmpistriz128:
19926 Opcode = X86ISD::PCMPISTRI;
19927 X86CC = X86::COND_E;
19929 case Intrinsic::x86_sse42_pcmpestriz128:
19930 Opcode = X86ISD::PCMPESTRI;
19931 X86CC = X86::COND_E;
19934 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19935 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19936 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19937 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19938 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19939 }
19941 case Intrinsic::x86_sse42_pcmpistri128:
19942 case Intrinsic::x86_sse42_pcmpestri128: {
19943 unsigned Opcode;
19944 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19945 Opcode = X86ISD::PCMPISTRI;
19946 else
19947 Opcode = X86ISD::PCMPESTRI;
19949 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19950 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19951 return DAG.getNode(Opcode, dl, VTs, NewOps);
19952 }
19954 case Intrinsic::eh_sjlj_lsda: {
19955 MachineFunction &MF = DAG.getMachineFunction();
19956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19957 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19958 auto &Context = MF.getMMI().getContext();
19959 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19960 Twine(MF.getFunctionNumber()));
19961 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19962 }
19964 case Intrinsic::x86_seh_lsda: {
19965 // Compute the symbol for the LSDA. We know it'll get emitted later.
19966 MachineFunction &MF = DAG.getMachineFunction();
19967 SDValue Op1 = Op.getOperand(1);
19968 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19969 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19970 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19972 // Generate a simple absolute symbol reference. This intrinsic is only
19973 // supported on 32-bit Windows, which isn't PIC.
19974 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19975 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19976 }
19978 case Intrinsic::x86_seh_recoverfp: {
19979 SDValue FnOp = Op.getOperand(1);
19980 SDValue IncomingFPOp = Op.getOperand(2);
19981 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19982 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19983 if (!Fn)
19984 report_fatal_error(
19985 "llvm.x86.seh.recoverfp must take a function as the first argument");
19986 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19987 }
19989 case Intrinsic::localaddress: {
19990 // Returns one of the stack, base, or frame pointer registers, depending on
19991 // which is used to reference local variables.
19992 MachineFunction &MF = DAG.getMachineFunction();
19993 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19994 unsigned Reg;
19995 if (RegInfo->hasBasePointer(MF))
19996 Reg = RegInfo->getBaseRegister();
19997 else // This function handles the SP or FP case.
19998 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19999 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20000 }
20001 }
20002 }
20004 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20005 SDValue Src, SDValue Mask, SDValue Base,
20006 SDValue Index, SDValue ScaleOp, SDValue Chain,
20007 const X86Subtarget &Subtarget) {
20008 SDLoc dl(Op);
20009 auto *C = cast<ConstantSDNode>(ScaleOp);
20010 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20011 EVT MaskVT = Mask.getValueType();
20012 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20013 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20014 SDValue Segment = DAG.getRegister(0, MVT::i32);
20015 // If source is undef or we know it won't be used, use a zero vector
20016 // to break register dependency.
20017 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20018 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20019 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20020 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20021 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20022 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20023 return DAG.getMergeValues(RetOps, dl);
20024 }
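// The operand order used above (Src, Base, Scale, Index, Disp, Segment, Mask,
// Chain) follows the usual X86 memory-operand layout, base + scale * index +
// disp with an optional segment, with the gather destination, mask and chain
// appended. Result 0 of the machine node is the gathered value and result 2
// is the output chain; getMergeValues pairs those two back up for the caller.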
20026 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20027 SDValue Src, SDValue Mask, SDValue Base,
20028 SDValue Index, SDValue ScaleOp, SDValue Chain,
20029 const X86Subtarget &Subtarget) {
20030 SDLoc dl(Op);
20031 auto *C = cast<ConstantSDNode>(ScaleOp);
20032 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20033 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20034 Index.getSimpleValueType().getVectorNumElements());
20036 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20037 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20038 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20039 SDValue Segment = DAG.getRegister(0, MVT::i32);
20040 // If source is undef or we know it won't be used, use a zero vector
20041 // to break register dependency.
20042 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20043 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20044 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20045 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20046 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20047 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20048 return DAG.getMergeValues(RetOps, dl);
20049 }
20051 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20052 SDValue Src, SDValue Mask, SDValue Base,
20053 SDValue Index, SDValue ScaleOp, SDValue Chain,
20054 const X86Subtarget &Subtarget) {
20055 SDLoc dl(Op);
20056 auto *C = cast<ConstantSDNode>(ScaleOp);
20057 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20058 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20059 SDValue Segment = DAG.getRegister(0, MVT::i32);
20060 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20061 Index.getSimpleValueType().getVectorNumElements());
20063 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20064 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20065 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20066 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20067 return SDValue(Res, 1);
20068 }
20070 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20071 SDValue Mask, SDValue Base, SDValue Index,
20072 SDValue ScaleOp, SDValue Chain,
20073 const X86Subtarget &Subtarget) {
20074 SDLoc dl(Op);
20075 auto *C = cast<ConstantSDNode>(ScaleOp);
20076 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20077 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20078 SDValue Segment = DAG.getRegister(0, MVT::i32);
20079 MVT MaskVT =
20080 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20081 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20082 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20083 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20084 return SDValue(Res, 0);
20085 }
20087 /// Handles the lowering of builtin intrinsics that return the value
20088 /// of the extended control register.
20089 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20090 SelectionDAG &DAG,
20091 const X86Subtarget &Subtarget,
20092 SmallVectorImpl<SDValue> &Results) {
20093 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20094 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20095 SDValue LO, HI;
20097 // The ECX register is used to select the index of the XCR register to
20098 // return.
20099 SDValue Chain =
20100 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20101 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20102 Chain = SDValue(N1, 0);
20104 // Reads the content of XCR and returns it in registers EDX:EAX.
20105 if (Subtarget.is64Bit()) {
20106 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20107 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20108 LO.getValue(2));
20109 } else {
20110 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20111 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20112 LO.getValue(2));
20113 }
20114 Chain = HI.getValue(1);
20116 if (Subtarget.is64Bit()) {
20117 // Merge the two 32-bit values into a 64-bit one.
20118 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20119 DAG.getConstant(32, DL, MVT::i8));
20120 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20121 Results.push_back(Chain);
20122 return;
20123 }
20125 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20126 SDValue Ops[] = { LO, HI };
20127 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20128 Results.push_back(Pair);
20129 Results.push_back(Chain);
20130 }
20132 /// Handles the lowering of builtin intrinsics that read performance monitor
20133 /// counters (x86_rdpmc).
20134 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20135 SelectionDAG &DAG,
20136 const X86Subtarget &Subtarget,
20137 SmallVectorImpl<SDValue> &Results) {
20138 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20139 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20140 SDValue LO, HI;
20142 // The ECX register is used to select the index of the performance counter
20143 // to read.
20144 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20145 N->getOperand(2));
20146 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20148 // Reads the content of a 64-bit performance counter and returns it in the
20149 // registers EDX:EAX.
20150 if (Subtarget.is64Bit()) {
20151 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20152 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20153 LO.getValue(2));
20154 } else {
20155 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20156 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20157 LO.getValue(2));
20158 }
20159 Chain = HI.getValue(1);
20161 if (Subtarget.is64Bit()) {
20162 // The EAX register is loaded with the low-order 32 bits. The EDX register
20163 // is loaded with the supported high-order bits of the counter.
20164 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20165 DAG.getConstant(32, DL, MVT::i8));
20166 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20167 Results.push_back(Chain);
20168 return;
20169 }
20171 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20172 SDValue Ops[] = { LO, HI };
20173 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20174 Results.push_back(Pair);
20175 Results.push_back(Chain);
20176 }
20178 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20179 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20180 /// READCYCLECOUNTER nodes.
20181 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20182 SelectionDAG &DAG,
20183 const X86Subtarget &Subtarget,
20184 SmallVectorImpl<SDValue> &Results) {
20185 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20186 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20187 SDValue LO, HI;
20189 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20190 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20191 // and the EAX register is loaded with the low-order 32 bits.
20192 if (Subtarget.is64Bit()) {
20193 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20194 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20195 LO.getValue(2));
20196 } else {
20197 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20198 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20199 LO.getValue(2));
20200 }
20201 SDValue Chain = HI.getValue(1);
20203 if (Opcode == X86ISD::RDTSCP_DAG) {
20204 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20206 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20207 // the ECX register. Add 'ecx' explicitly to the chain.
20208 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20209 HI.getValue(2));
20210 // Explicitly store the content of ECX at the location passed in input
20211 // to the 'rdtscp' intrinsic.
20212 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20213 MachinePointerInfo());
20214 }
20216 if (Subtarget.is64Bit()) {
20217 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20218 // the EAX register is loaded with the low-order 32 bits.
20219 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20220 DAG.getConstant(32, DL, MVT::i8));
20221 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20222 Results.push_back(Chain);
20223 return;
20224 }
20226 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20227 SDValue Ops[] = { LO, HI };
20228 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20229 Results.push_back(Pair);
20230 Results.push_back(Chain);
20231 }
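// For a plain 64-bit RDTSC the code above amounts to roughly this sequence
// (a sketch of the eventual machine code, not literal output):
//   rdtsc                  ; counter -> EDX:EAX
//   shlq $32, %rdx
//   orq  %rdx, %rax        ; full 64-bit counter in RAX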
20233 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20234 SelectionDAG &DAG) {
20235 SmallVector<SDValue, 2> Results;
20236 SDLoc DL(Op);
20237 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20238 Results);
20239 return DAG.getMergeValues(Results, DL);
20240 }
20242 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20243 MachineFunction &MF = DAG.getMachineFunction();
20244 SDValue Chain = Op.getOperand(0);
20245 SDValue RegNode = Op.getOperand(2);
20246 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20247 if (!EHInfo)
20248 report_fatal_error("EH registrations only live in functions using WinEH");
20250 // Cast the operand to an alloca, and remember the frame index.
20251 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20252 if (!FINode)
20253 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20254 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20256 // Return the chain operand without making any DAG nodes.
20257 return Chain;
20258 }
20260 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20261 MachineFunction &MF = DAG.getMachineFunction();
20262 SDValue Chain = Op.getOperand(0);
20263 SDValue EHGuard = Op.getOperand(2);
20264 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20265 if (!EHInfo)
20266 report_fatal_error("EHGuard is only live in functions using WinEH");
20268 // Cast the operand to an alloca, and remember the frame index.
20269 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20270 if (!FINode)
20271 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20272 EHInfo->EHGuardFrameIndex = FINode->getIndex();
20274 // Return the chain operand without making any DAG nodes.
20275 return Chain;
20276 }
20278 /// Emit Truncating Store with signed or unsigned saturation.
20279 static SDValue
20280 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20281 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20282 SelectionDAG &DAG) {
20284 SDVTList VTs = DAG.getVTList(MVT::Other);
20285 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20286 SDValue Ops[] = { Chain, Val, Ptr, Undef };
20287 return SignedSat ?
20288 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20289 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20290 }
20292 /// Emit Masked Truncating Store with signed or unsigned saturation.
20293 static SDValue
20294 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20295 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20296 MachineMemOperand *MMO, SelectionDAG &DAG) {
20298 SDVTList VTs = DAG.getVTList(MVT::Other);
20299 SDValue Ops[] = { Chain, Ptr, Mask, Val };
20300 return SignedSat ?
20301 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20302 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20303 }
20305 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20306 SelectionDAG &DAG) {
20307 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20309 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20310 if (!IntrData) {
20311 switch (IntNo) {
20312 case llvm::Intrinsic::x86_seh_ehregnode:
20313 return MarkEHRegistrationNode(Op, DAG);
20314 case llvm::Intrinsic::x86_seh_ehguard:
20315 return MarkEHGuard(Op, DAG);
20316 case llvm::Intrinsic::x86_flags_read_u32:
20317 case llvm::Intrinsic::x86_flags_read_u64:
20318 case llvm::Intrinsic::x86_flags_write_u32:
20319 case llvm::Intrinsic::x86_flags_write_u64: {
20320 // We need a frame pointer because this will get lowered to a PUSH/POP
20321 // sequence.
20322 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20323 MFI.setHasCopyImplyingStackAdjustment(true);
20324 // Don't do anything here, we will expand these intrinsics out later
20325 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20326 return SDValue();
20327 }
20328 case Intrinsic::x86_lwpins32:
20329 case Intrinsic::x86_lwpins64: {
20330 SDLoc dl(Op);
20331 SDValue Chain = Op->getOperand(0);
20332 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20333 SDValue LwpIns =
20334 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20335 Op->getOperand(3), Op->getOperand(4));
20336 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20337 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20338 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20339 LwpIns.getValue(1));
20340 }
20341 }
20342 return SDValue();
20343 }
20345 SDLoc dl(Op);
20346 switch(IntrData->Type) {
20347 default: llvm_unreachable("Unknown Intrinsic Type");
20348 case RDSEED:
20349 case RDRAND: {
20350 // Emit the node with the right value type.
20351 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20352 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20354 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20355 // Otherwise return the value from Rand, which is always 0, casted to i32.
20356 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20357 DAG.getConstant(1, dl, Op->getValueType(1)),
20358 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20359 SDValue(Result.getNode(), 1) };
20360 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20361 DAG.getVTList(Op->getValueType(1), MVT::Glue),
20362 Ops);
20364 // Return { result, isValid, chain }.
20365 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20366 SDValue(Result.getNode(), 2));
20367 }
20368 case GATHER_AVX2: {
20369 SDValue Chain = Op.getOperand(0);
20370 SDValue Src = Op.getOperand(2);
20371 SDValue Base = Op.getOperand(3);
20372 SDValue Index = Op.getOperand(4);
20373 SDValue Mask = Op.getOperand(5);
20374 SDValue Scale = Op.getOperand(6);
20375 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20376 Scale, Chain, Subtarget);
20377 }
20378 case GATHER: {
20379 // gather(v1, mask, index, base, scale);
20380 SDValue Chain = Op.getOperand(0);
20381 SDValue Src = Op.getOperand(2);
20382 SDValue Base = Op.getOperand(3);
20383 SDValue Index = Op.getOperand(4);
20384 SDValue Mask = Op.getOperand(5);
20385 SDValue Scale = Op.getOperand(6);
20386 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20387 Chain, Subtarget);
20388 }
20389 case SCATTER: {
20390 // scatter(base, mask, index, v1, scale);
20391 SDValue Chain = Op.getOperand(0);
20392 SDValue Base = Op.getOperand(2);
20393 SDValue Mask = Op.getOperand(3);
20394 SDValue Index = Op.getOperand(4);
20395 SDValue Src = Op.getOperand(5);
20396 SDValue Scale = Op.getOperand(6);
20397 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20398 Scale, Chain, Subtarget);
20399 }
20400 case PREFETCH: {
20401 SDValue Hint = Op.getOperand(6);
20402 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20403 assert((HintVal == 2 || HintVal == 3) &&
20404 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20405 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20406 SDValue Chain = Op.getOperand(0);
20407 SDValue Mask = Op.getOperand(2);
20408 SDValue Index = Op.getOperand(3);
20409 SDValue Base = Op.getOperand(4);
20410 SDValue Scale = Op.getOperand(5);
20411 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20412 Subtarget);
20413 }
20414 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20415 case RDTSC: {
20416 SmallVector<SDValue, 2> Results;
20417 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20418 Results);
20419 return DAG.getMergeValues(Results, dl);
20420 }
20421 // Read Performance Monitoring Counters.
20422 case RDPMC: {
20423 SmallVector<SDValue, 2> Results;
20424 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20425 return DAG.getMergeValues(Results, dl);
20426 }
20427 // Get Extended Control Register.
20428 case XGETBV: {
20429 SmallVector<SDValue, 2> Results;
20430 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20431 return DAG.getMergeValues(Results, dl);
20432 }
20433 // XTEST intrinsics.
20434 case XTEST: {
20435 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20436 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20438 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20439 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20440 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20441 Ret, SDValue(InTrans.getNode(), 1));
20442 }
20443 // ADC/ADCX/SBB
20444 case ADX: {
20445 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20446 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
20447 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20448 DAG.getConstant(-1, dl, MVT::i8));
20449 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20450 Op.getOperand(4), GenCF.getValue(1));
20451 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20452 Op.getOperand(5), MachinePointerInfo());
20453 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20454 SDValue Results[] = { SetCC, Store };
20455 return DAG.getMergeValues(Results, dl);
20456 }
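// The X86ISD::ADD with constant -1 above rebuilds CF from the incoming i8
// carry operand: carry + 0xFF overflows (and so sets CF) exactly when the
// carry operand is nonzero, e.g. 1 + 0xFF = 0x100 sets CF while 0 + 0xFF =
// 0xFF does not. The ADC/SBB node then consumes that flag via GenCF.getValue(1).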
20457 case COMPRESS_TO_MEM: {
20458 SDValue Mask = Op.getOperand(4);
20459 SDValue DataToCompress = Op.getOperand(3);
20460 SDValue Addr = Op.getOperand(2);
20461 SDValue Chain = Op.getOperand(0);
20462 MVT VT = DataToCompress.getSimpleValueType();
20464 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20465 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20467 if (isAllOnesConstant(Mask)) // return just a store
20468 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20469 MemIntr->getMemOperand());
20471 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20472 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20474 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20475 MemIntr->getMemOperand(),
20476 false /* truncating */, true /* compressing */);
20477 }
20478 case TRUNCATE_TO_MEM_VI8:
20479 case TRUNCATE_TO_MEM_VI16:
20480 case TRUNCATE_TO_MEM_VI32: {
20481 SDValue Mask = Op.getOperand(4);
20482 SDValue DataToTruncate = Op.getOperand(3);
20483 SDValue Addr = Op.getOperand(2);
20484 SDValue Chain = Op.getOperand(0);
20486 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20487 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20489 EVT MemVT = MemIntr->getMemoryVT();
20491 uint16_t TruncationOp = IntrData->Opc0;
20492 switch (TruncationOp) {
20493 case X86ISD::VTRUNC: {
20494 if (isAllOnesConstant(Mask)) // return just a truncate store
20495 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20496 MemIntr->getMemOperand());
20498 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20499 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20501 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20502 MemIntr->getMemOperand(), true /* truncating */);
20503 }
20504 case X86ISD::VTRUNCUS:
20505 case X86ISD::VTRUNCS: {
20506 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20507 if (isAllOnesConstant(Mask))
20508 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20509 MemIntr->getMemOperand(), DAG);
20511 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20512 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20514 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20515 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20516 }
20517 default:
20518 llvm_unreachable("Unsupported truncstore intrinsic");
20519 }
20520 }
20522 case EXPAND_FROM_MEM: {
20523 SDValue Mask = Op.getOperand(4);
20524 SDValue PassThru = Op.getOperand(3);
20525 SDValue Addr = Op.getOperand(2);
20526 SDValue Chain = Op.getOperand(0);
20527 MVT VT = Op.getSimpleValueType();
20529 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20530 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20532 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20533 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20534 if (X86::isZeroNode(Mask))
20535 return DAG.getUNDEF(VT);
20537 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20538 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20539 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20540 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20541 true /* expanding */);
20542 }
20543 }
20544 }
20546 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20547 SelectionDAG &DAG) const {
20548 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20549 MFI.setReturnAddressIsTaken(true);
20551 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20552 return SDValue();
20554 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20555 SDLoc dl(Op);
20556 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20558 if (Depth > 0) {
20559 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20560 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20561 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20562 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20563 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20564 MachinePointerInfo());
20565 }
20567 // Just load the return address.
20568 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20569 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20570 MachinePointerInfo());
20571 }
20573 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20574 SelectionDAG &DAG) const {
20575 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20576 return getReturnAddressFrameIndex(DAG);
20577 }
20579 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20580 MachineFunction &MF = DAG.getMachineFunction();
20581 MachineFrameInfo &MFI = MF.getFrameInfo();
20582 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20583 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20584 EVT VT = Op.getValueType();
20586 MFI.setFrameAddressIsTaken(true);
20588 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20589 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
20590 // is not possible to crawl up the stack without looking at the unwind codes
20591 // simultaneously.
20592 int FrameAddrIndex = FuncInfo->getFAIndex();
20593 if (!FrameAddrIndex) {
20594 // Set up a frame object for the return address.
20595 unsigned SlotSize = RegInfo->getSlotSize();
20596 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20597 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20598 FuncInfo->setFAIndex(FrameAddrIndex);
20599 }
20600 return DAG.getFrameIndex(FrameAddrIndex, VT);
20601 }
20603 unsigned FrameReg =
20604 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20605 SDLoc dl(Op); // FIXME probably not meaningful
20606 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20607 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20608 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20609 "Invalid Frame Register!");
20610 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20611 while (Depth--)
20612 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20613 MachinePointerInfo());
20614 return FrameAddr;
20615 }
20617 // FIXME? Maybe this could be a TableGen attribute on some registers and
20618 // this table could be generated automatically from RegInfo.
20619 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20620 SelectionDAG &DAG) const {
20621 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20622 const MachineFunction &MF = DAG.getMachineFunction();
20624 unsigned Reg = StringSwitch<unsigned>(RegName)
20625 .Case("esp", X86::ESP)
20626 .Case("rsp", X86::RSP)
20627 .Case("ebp", X86::EBP)
20628 .Case("rbp", X86::RBP)
20631 if (Reg == X86::EBP || Reg == X86::RBP) {
20632 if (!TFI.hasFP(MF))
20633 report_fatal_error("register " + StringRef(RegName) +
20634 " is allocatable: function has no frame pointer");
20637 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20638 unsigned FrameReg =
20639 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20640 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20641 "Invalid Frame Register!");
20649 report_fatal_error("Invalid register name global variable");
20652 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20653 SelectionDAG &DAG) const {
20654 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20655 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20656 }
20658 unsigned X86TargetLowering::getExceptionPointerRegister(
20659 const Constant *PersonalityFn) const {
20660 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20661 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20663 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20664 }
20666 unsigned X86TargetLowering::getExceptionSelectorRegister(
20667 const Constant *PersonalityFn) const {
20668 // Funclet personalities don't use selectors (the runtime does the selection).
20669 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20670 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20671 }
20673 bool X86TargetLowering::needsFixedCatchObjects() const {
20674 return Subtarget.isTargetWin64();
20675 }
20677 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20678 SDValue Chain = Op.getOperand(0);
20679 SDValue Offset = Op.getOperand(1);
20680 SDValue Handler = Op.getOperand(2);
20681 SDLoc dl(Op);
20683 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20684 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20685 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20686 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20687 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20688 "Invalid Frame Register!");
20689 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20690 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20692 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20693 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20694 dl));
20695 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20696 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20697 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20699 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20700 DAG.getRegister(StoreAddrReg, PtrVT));
20701 }
20703 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20704 SelectionDAG &DAG) const {
20705 SDLoc DL(Op);
20706 // If the subtarget is not 64bit, we may need the global base reg
20707 // after isel expand pseudo, i.e., after CGBR pass ran.
20708 // Therefore, ask for the GlobalBaseReg now, so that the pass
20709 // inserts the code for us in case we need it.
20710 // Otherwise, we will end up in a situation where we will
20711 // reference a virtual register that is not defined!
20712 if (!Subtarget.is64Bit()) {
20713 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20714 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20715 }
20716 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20717 DAG.getVTList(MVT::i32, MVT::Other),
20718 Op.getOperand(0), Op.getOperand(1));
20719 }
20721 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20722 SelectionDAG &DAG) const {
20723 SDLoc DL(Op);
20724 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20725 Op.getOperand(0), Op.getOperand(1));
20726 }
20728 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20729 SelectionDAG &DAG) const {
20730 SDLoc DL(Op);
20731 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20732 Op.getOperand(0));
20733 }
20735 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20736 return Op.getOperand(0);
20737 }
20739 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20740 SelectionDAG &DAG) const {
20741 SDValue Root = Op.getOperand(0);
20742 SDValue Trmp = Op.getOperand(1); // trampoline
20743 SDValue FPtr = Op.getOperand(2); // nested function
20744 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20745 SDLoc dl(Op);
20747 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20748 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20750 if (Subtarget.is64Bit()) {
20751 SDValue OutChains[6];
20753 // Large code-model.
20754 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20755 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20757 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20758 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20760 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20762 // Load the pointer to the nested function into R11.
20763 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20764 SDValue Addr = Trmp;
20765 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20766 Addr, MachinePointerInfo(TrmpAddr));
20768 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20769 DAG.getConstant(2, dl, MVT::i64));
20770 OutChains[1] =
20771 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20772 /* Alignment = */ 2);
20774 // Load the 'nest' parameter value into R10.
20775 // R10 is specified in X86CallingConv.td
20776 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20777 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20778 DAG.getConstant(10, dl, MVT::i64));
20779 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20780 Addr, MachinePointerInfo(TrmpAddr, 10));
20782 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20783 DAG.getConstant(12, dl, MVT::i64));
20784 OutChains[3] =
20785 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20786 /* Alignment = */ 2);
20788 // Jump to the nested function.
20789 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20790 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20791 DAG.getConstant(20, dl, MVT::i64));
20792 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20793 Addr, MachinePointerInfo(TrmpAddr, 20));
20795 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20796 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20797 DAG.getConstant(22, dl, MVT::i64));
20798 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20799 Addr, MachinePointerInfo(TrmpAddr, 22));
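// Putting the stores above together, the trampoline bytes are laid out as
// (offsets into Trmp; opcode bytes as computed from REX_WB, MOV64ri and JMP64r):
//   0: 49 BB      movabsq $<FPtr>, %r11
//   2: <FPtr>     8-byte address of the nested function
//  10: 49 BA      movabsq $<Nest>, %r10
//  12: <Nest>     8-byte 'nest' parameter value
//  20: 49 FF      jmpq *%r11 (REX prefix + opcode)
//  22: E3         ModRM byte selecting the r11 register form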
20801 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20802 } else {
20803 const Function *Func =
20804 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20805 CallingConv::ID CC = Func->getCallingConv();
20806 unsigned NestReg;
20808 switch (CC) {
20809 default:
20810 llvm_unreachable("Unsupported calling convention");
20811 case CallingConv::C:
20812 case CallingConv::X86_StdCall: {
20813 // Pass 'nest' parameter in ECX.
20814 // Must be kept in sync with X86CallingConv.td
20815 NestReg = X86::ECX;
20817 // Check that ECX wasn't needed by an 'inreg' parameter.
20818 FunctionType *FTy = Func->getFunctionType();
20819 const AttributeList &Attrs = Func->getAttributes();
20821 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20822 unsigned InRegCount = 0;
20823 unsigned Idx = 1;
20825 for (FunctionType::param_iterator I = FTy->param_begin(),
20826 E = FTy->param_end(); I != E; ++I, ++Idx)
20827 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20828 auto &DL = DAG.getDataLayout();
20829 // FIXME: should only count parameters that are lowered to integers.
20830 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20831 }
20833 if (InRegCount > 2) {
20834 report_fatal_error("Nest register in use - reduce number of inreg"
20835 " parameters!");
20836 }
20837 }
20838 break;
20839 }
20840 case CallingConv::X86_FastCall:
20841 case CallingConv::X86_ThisCall:
20842 case CallingConv::Fast:
20843 // Pass 'nest' parameter in EAX.
20844 // Must be kept in sync with X86CallingConv.td
20845 NestReg = X86::EAX;
20846 break;
20847 }
20849 SDValue OutChains[4];
20850 SDValue Addr, Disp;
20852 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20853 DAG.getConstant(10, dl, MVT::i32));
20854 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20856 // This is storing the opcode for MOV32ri.
20857 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20858 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20859 OutChains[0] =
20860 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20861 Trmp, MachinePointerInfo(TrmpAddr));
20863 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20864 DAG.getConstant(1, dl, MVT::i32));
20865 OutChains[1] =
20866 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20867 /* Alignment = */ 1);
20869 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20870 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20871 DAG.getConstant(5, dl, MVT::i32));
20872 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20873 Addr, MachinePointerInfo(TrmpAddr, 5),
20874 /* Alignment = */ 1);
20876 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20877 DAG.getConstant(6, dl, MVT::i32));
20878 OutChains[3] =
20879 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20880 /* Alignment = */ 1);
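// The resulting 32-bit trampoline is 10 bytes (offsets into Trmp):
//   0: B8+reg    movl $<Nest>, %ecx or %eax (opcode byte encodes NestReg)
//   1: <Nest>    4-byte 'nest' parameter value
//   5: E9        jmp rel32
//   6: <Disp>    4-byte displacement to the nested function, FPtr - (Trmp + 10)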
20882 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20883 }
20884 }
20886 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20887 SelectionDAG &DAG) const {
20888 /*
20889 The rounding mode is in bits 11:10 of FPSR, and has the following
20890 settings:
20891 00 Round to nearest
20892 01 Round to -inf
20893 10 Round to +inf
20894 11 Round to 0
20896 FLT_ROUNDS, on the other hand, expects the following:
20897 -1 Undefined
20898 0 Round to 0
20899 1 Round to nearest
20900 2 Round to +inf
20901 3 Round to -inf
20903 To perform the conversion, we do:
20904 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20905 */
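// Worked example of the conversion above: for FPSR bits 11:10 = 10 (round to
// +inf), (FPSR & 0x800) >> 11 = 1 and (FPSR & 0x400) >> 9 = 0, so
// (1 | 0) + 1 = 2, and 2 & 3 = 2, the FLT_ROUNDS encoding of "round to +inf".
// Likewise bits 11 (round to 0) give (1 | 2) + 1 = 4, masked by & 3 to 0.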
20907 MachineFunction &MF = DAG.getMachineFunction();
20908 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20909 unsigned StackAlignment = TFI.getStackAlignment();
20910 MVT VT = Op.getSimpleValueType();
20911 SDLoc DL(Op);
20913 // Save FP Control Word to stack slot
20914 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20915 SDValue StackSlot =
20916 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20918 MachineMemOperand *MMO =
20919 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20920 MachineMemOperand::MOStore, 2, 2);
20922 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20923 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20924 DAG.getVTList(MVT::Other),
20925 Ops, MVT::i16, MMO);
20927 // Load FP Control Word from stack slot
20928 SDValue CWD =
20929 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20931 // Transform as necessary
20932 SDValue CWD1 =
20933 DAG.getNode(ISD::SRL, DL, MVT::i16,
20934 DAG.getNode(ISD::AND, DL, MVT::i16,
20935 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20936 DAG.getConstant(11, DL, MVT::i8));
20937 SDValue CWD2 =
20938 DAG.getNode(ISD::SRL, DL, MVT::i16,
20939 DAG.getNode(ISD::AND, DL, MVT::i16,
20940 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20941 DAG.getConstant(9, DL, MVT::i8));
20943 SDValue RetVal =
20944 DAG.getNode(ISD::AND, DL, MVT::i16,
20945 DAG.getNode(ISD::ADD, DL, MVT::i16,
20946 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20947 DAG.getConstant(1, DL, MVT::i16)),
20948 DAG.getConstant(3, DL, MVT::i16));
20950 return DAG.getNode((VT.getSizeInBits() < 16 ?
20951 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20952 }
20954 // Split a unary integer op into 2 half-sized ops.
20955 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
20956 MVT VT = Op.getSimpleValueType();
20957 unsigned NumElems = VT.getVectorNumElements();
20958 unsigned SizeInBits = VT.getSizeInBits();
20960 // Extract the Lo/Hi vectors
20961 SDLoc dl(Op);
20962 SDValue Src = Op.getOperand(0);
20963 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
20964 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
20966 MVT EltVT = VT.getVectorElementType();
20967 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
20968 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20969 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
20970 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
20971 }
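// For example, a v32i8 unary op that is illegal at that width is handled here
// by running the same opcode on the two extracted v16i8 halves and rejoining
// the results with CONCAT_VECTORS.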
20973 // Decompose 256-bit ops into smaller 128-bit ops.
20974 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
20975 assert(Op.getSimpleValueType().is256BitVector() &&
20976 Op.getSimpleValueType().isInteger() &&
20977 "Only handle AVX 256-bit vector integer operation");
20978 return LowerVectorIntUnary(Op, DAG);
20979 }
20981 // Decompose 512-bit ops into smaller 256-bit ops.
20982 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
20983 assert(Op.getSimpleValueType().is512BitVector() &&
20984 Op.getSimpleValueType().isInteger() &&
20985 "Only handle AVX 512-bit vector integer operation");
20986 return LowerVectorIntUnary(Op, DAG);
20987 }
20989 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
20990 //
20991 // i8/i16 vector implemented using dword LZCNT vector instruction
20992 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20993 // split the vector, perform the operation on its Lo and Hi parts and
20994 // concatenate the results.
20995 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
20996 assert(Op.getOpcode() == ISD::CTLZ);
20997 SDLoc dl(Op);
20998 MVT VT = Op.getSimpleValueType();
20999 MVT EltVT = VT.getVectorElementType();
21000 unsigned NumElems = VT.getVectorNumElements();
21002 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21003 "Unsupported element type");
21005 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21006 if (16 < NumElems)
21007 return LowerVectorIntUnary(Op, DAG);
21009 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21010 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21011 "Unsupported value type for operation");
21013 // Use the natively supported vector instruction vplzcntd.
21014 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21015 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21016 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21017 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21019 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21020 }
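// Illustration for v16i8: the input is zero-extended to v16i32, VPLZCNTD
// counts leading zeros per dword, the result is truncated back to v16i8, and
// Delta = 32 - 8 = 24 is subtracted to drop the zeros added by the extension.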
21022 // Lower CTLZ using a PSHUFB lookup table implementation.
21023 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21024 const X86Subtarget &Subtarget,
21025 SelectionDAG &DAG) {
21026 MVT VT = Op.getSimpleValueType();
21027 int NumElts = VT.getVectorNumElements();
21028 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21029 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21031 // Per-nibble leading zero PSHUFB lookup table.
21032 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21033 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21034 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21035 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
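// The table holds the leading-zero count of each 4-bit value: nibble 0x1
// (0b0001) has 3 leading zeros, nibble 0x4 (0b0100) has 1, and every nibble
// >= 8 starts with a set bit, so it maps to 0.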
21037 SmallVector<SDValue, 64> LUTVec;
21038 for (int i = 0; i < NumBytes; ++i)
21039 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21040 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21042 // Begin by bitcasting the input to byte vector, then split those bytes
21043 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21044 // If the hi input nibble is zero then we add both results together, otherwise
21045 // we just take the hi result (by masking the lo result to zero before the
21046 // add).
21047 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21048 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21050 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21051 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21052 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21053 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21054 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21056 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21057 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21058 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21059 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21061 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21062 // of the current vector width in the same way we did for the nibbles.
21063 // If the upper half of the input element is zero then add the halves'
21064 // leading zero counts together, otherwise just use the upper half's.
21065 // Double the width of the result until we are at target width.
21066 while (CurrVT != VT) {
21067 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21068 int CurrNumElts = CurrVT.getVectorNumElements();
21069 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21070 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21071 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21073 // Check if the upper half of the input element is zero.
21074 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21075 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21076 HiZ = DAG.getBitcast(NextVT, HiZ);
21078 // Move the upper/lower halves to the lower bits as we'll be extending to
21079 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21080 // together.
21081 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21082 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21083 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21084 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21085 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21087 CurrVT = NextVT;
21088 }
21090 return Res;
21091 }
21092 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21093 const X86Subtarget &Subtarget,
21094 SelectionDAG &DAG) {
21095 MVT VT = Op.getSimpleValueType();
21097 if (Subtarget.hasCDI())
21098 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21100 // Decompose 256-bit ops into smaller 128-bit ops.
21101 if (VT.is256BitVector() && !Subtarget.hasInt256())
21102 return Lower256IntUnary(Op, DAG);
21104 // Decompose 512-bit ops into smaller 256-bit ops.
21105 if (VT.is512BitVector() && !Subtarget.hasBWI())
21106 return Lower512IntUnary(Op, DAG);
21108 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21109 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21110 }
21112 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21113 SelectionDAG &DAG) {
21114 MVT VT = Op.getSimpleValueType();
21115 MVT OpVT = VT;
21116 unsigned NumBits = VT.getSizeInBits();
21117 SDLoc dl(Op);
21118 unsigned Opc = Op.getOpcode();
21120 if (VT.isVector())
21121 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21123 Op = Op.getOperand(0);
21124 if (VT == MVT::i8) {
21125 // Zero extend to i32 since there is not an i8 bsr.
21126 OpVT = MVT::i32;
21127 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21128 }
21130 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21131 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21132 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21134 if (Opc == ISD::CTLZ) {
21135 // If src is zero (i.e. bsr sets ZF), returns NumBits.
21136 SDValue Ops[] = {
21137 Op,
21138 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21139 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21140 Op.getValue(1)
21141 };
21142 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21143 }
21145 // Finally xor with NumBits-1.
21146 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21147 DAG.getConstant(NumBits - 1, dl, OpVT));
21149 if (VT == MVT::i8)
21150 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21151 return Op;
21152 }
21154 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21155 MVT VT = Op.getSimpleValueType();
21156 unsigned NumBits = VT.getScalarSizeInBits();
21157 SDLoc dl(Op);
21159 if (VT.isVector()) {
21160 SDValue N0 = Op.getOperand(0);
21161 SDValue Zero = DAG.getConstant(0, dl, VT);
21163 // lsb(x) = (x & -x)
21164 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21165 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21167 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21168 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21169 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21170 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21171 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21174 // cttz(x) = ctpop(lsb - 1)
21175 SDValue One = DAG.getConstant(1, dl, VT);
21176 return DAG.getNode(ISD::CTPOP, dl, VT,
21177 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21178 }
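// Worked example of the identities above for an i8 element x = 12 (0b00001100):
// lsb = x & -x = 4, so cttz(x) = ctpop(lsb - 1) = ctpop(0b011) = 2, and
// cttz_undef(x) = (width - 1) - ctlz(lsb) = 7 - 5 = 2 as well.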
21180 assert(Op.getOpcode() == ISD::CTTZ &&
21181 "Only scalar CTTZ requires custom lowering");
21183 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21184 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21185 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21187 // If src is zero (i.e. bsf sets ZF), returns NumBits.
21188 SDValue Ops[] = {
21189 Op,
21190 DAG.getConstant(NumBits, dl, VT),
21191 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21192 Op.getValue(1)
21193 };
21194 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21195 }
21197 /// Break a 256-bit integer operation into two new 128-bit ones and then
21198 /// concatenate the result back.
21199 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21200 MVT VT = Op.getSimpleValueType();
21202 assert(VT.is256BitVector() && VT.isInteger() &&
21203 "Unsupported value type for operation");
21205 unsigned NumElems = VT.getVectorNumElements();
21207 SDLoc dl(Op);
21208 // Extract the LHS vectors
21209 SDValue LHS = Op.getOperand(0);
21210 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21211 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21213 // Extract the RHS vectors
21214 SDValue RHS = Op.getOperand(1);
21215 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21216 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21218 MVT EltVT = VT.getVectorElementType();
21219 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21221 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21222 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21223 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21224 }
21226 /// Break a 512-bit integer operation into two new 256-bit ones and then
21227 /// concatenate the result back.
21228 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21229 MVT VT = Op.getSimpleValueType();
21231 assert(VT.is512BitVector() && VT.isInteger() &&
21232 "Unsupported value type for operation");
21234 unsigned NumElems = VT.getVectorNumElements();
21236 SDLoc dl(Op);
21237 // Extract the LHS vectors
21238 SDValue LHS = Op.getOperand(0);
21239 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21240 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21242 // Extract the RHS vectors
21243 SDValue RHS = Op.getOperand(1);
21244 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21245 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21247 MVT EltVT = VT.getVectorElementType();
21248 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21250 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21251 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21252 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21253 }
21255 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21256 MVT VT = Op.getSimpleValueType();
21257 if (VT.getScalarType() == MVT::i1)
21258 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21259 Op.getOperand(0), Op.getOperand(1));
21260 assert(Op.getSimpleValueType().is256BitVector() &&
21261 Op.getSimpleValueType().isInteger() &&
21262 "Only handle AVX 256-bit vector integer operation");
21263 return Lower256IntArith(Op, DAG);
21264 }
21266 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21267 assert(Op.getSimpleValueType().is256BitVector() &&
21268 Op.getSimpleValueType().isInteger() &&
21269 "Only handle AVX 256-bit vector integer operation");
21270 return Lower256IntUnary(Op, DAG);
21271 }
21273 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21274 assert(Op.getSimpleValueType().is256BitVector() &&
21275 Op.getSimpleValueType().isInteger() &&
21276 "Only handle AVX 256-bit vector integer operation");
21277 return Lower256IntArith(Op, DAG);
21278 }
21280 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21281 SelectionDAG &DAG) {
21282 SDLoc dl(Op);
21283 MVT VT = Op.getSimpleValueType();
21285 if (VT.getScalarType() == MVT::i1)
21286 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21288 // Decompose 256-bit ops into smaller 128-bit ops.
21289 if (VT.is256BitVector() && !Subtarget.hasInt256())
21290 return Lower256IntArith(Op, DAG);
21292 SDValue A = Op.getOperand(0);
21293 SDValue B = Op.getOperand(1);
21295 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21296 // vector pairs, multiply and truncate.
21297 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21298 if (Subtarget.hasInt256()) {
21299 // For 512-bit vectors, split into 256-bit vectors to allow the
21300 // sign-extension to occur.
21301 if (VT == MVT::v64i8)
21302 return Lower512IntArith(Op, DAG);
21304 // For 256-bit vectors, split into 128-bit vectors to allow the
21305 // sign-extension to occur. We don't need this on AVX512BW as we can
21306 // safely sign-extend to v32i16.
21307 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21308 return Lower256IntArith(Op, DAG);
21310 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21311 return DAG.getNode(
21312 ISD::TRUNCATE, dl, VT,
21313 DAG.getNode(ISD::MUL, dl, ExVT,
21314 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21315 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21316 }
21318 assert(VT == MVT::v16i8 &&
21319 "Pre-AVX2 support only supports v16i8 multiplication");
21320 MVT ExVT = MVT::v8i16;
21322 // Extract the lo parts and sign extend to i16
21323 SDValue ALo, BLo;
21324 if (Subtarget.hasSSE41()) {
21325 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21326 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21327 } else {
21328 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21329 -1, 4, -1, 5, -1, 6, -1, 7};
21330 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21331 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21332 ALo = DAG.getBitcast(ExVT, ALo);
21333 BLo = DAG.getBitcast(ExVT, BLo);
21334 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21335 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21336 }
21338 // Extract the hi parts and sign extend to i16
21339 SDValue AHi, BHi;
21340 if (Subtarget.hasSSE41()) {
21341 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21342 -1, -1, -1, -1, -1, -1, -1, -1};
21343 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21344 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21345 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21346 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21347 } else {
21348 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21349 -1, 12, -1, 13, -1, 14, -1, 15};
21350 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21351 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21352 AHi = DAG.getBitcast(ExVT, AHi);
21353 BHi = DAG.getBitcast(ExVT, BHi);
21354 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21355 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21356 }
21358 // Multiply, mask the lower 8 bits of the lo/hi results and pack
21359 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21360 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21361 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21362 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21363 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21364 }
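// The widening above is sound because the low 8 bits of a 16-bit product
// depend only on the low 8 bits of the operands: e.g. (-3) * 5 sign-extends
// to 0xFFFD * 0x0005 = 0xFFF1, whose low byte 0xF1 is exactly -15 in i8.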
21366 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21367 if (VT == MVT::v4i32) {
21368 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21369 "Should not custom lower when pmuldq is available!");
21371 // Extract the odd parts.
21372 static const int UnpackMask[] = { 1, -1, 3, -1 };
21373 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21374 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21376 // Multiply the even parts.
21377 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21378 // Now multiply odd parts.
21379 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21381 Evens = DAG.getBitcast(VT, Evens);
21382 Odds = DAG.getBitcast(VT, Odds);
21384 // Merge the two vectors back together with a shuffle. This expands into 2
21385 // shuffles.
21386 static const int ShufMask[] = { 0, 4, 2, 6 };
21387 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21388 }
21390 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21391 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21393 // 32-bit vector types used for MULDQ/MULUDQ.
21394 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21396 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21397 // 32-bits. We can lower with this if the sign bits stretch that far.
21398 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21399 DAG.ComputeNumSignBits(B) > 32) {
21400 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21401 DAG.getBitcast(MulVT, B));
21402 }
21404 // Ahi = psrlqi(a, 32);
21405 // Bhi = psrlqi(b, 32);
21407 // AloBlo = pmuludq(a, b);
21408 // AloBhi = pmuludq(a, Bhi);
21409 // AhiBlo = pmuludq(Ahi, b);
21411 // Hi = psllqi(AloBhi + AhiBlo, 32);
21412 // return AloBlo + Hi;
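// This is schoolbook multiplication on 32-bit digits:
//   a * b = (Alo + 2^32 * Ahi) * (Blo + 2^32 * Bhi)
//         = Alo*Blo + 2^32 * (Alo*Bhi + Ahi*Blo)   (mod 2^64)
// The Ahi*Bhi term is shifted entirely out of the low 64 bits, so it is never
// computed.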
  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

  // Bit cast to 32-bit vectors for MULUDQ.
  SDValue Alo = DAG.getBitcast(MulVT, A);
  SDValue Blo = DAG.getBitcast(MulVT, B);

  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

  // Only multiply lo/hi halves that aren't known to be zero.
  SDValue AloBlo = Zero;
  if (!ALoIsZero && !BLoIsZero)
    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

  SDValue AloBhi = Zero;
  if (!ALoIsZero && !BHiIsZero) {
    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
    Bhi = DAG.getBitcast(MulVT, Bhi);
    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
  }

  SDValue AhiBlo = Zero;
  if (!AHiIsZero && !BLoIsZero) {
    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
    Ahi = DAG.getBitcast(MulVT, Ahi);
    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
  }

  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}

static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntArith(Op, DAG);

  // Only i8 vectors should need custom lowering after this.
  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
         "Unsupported vector type");

  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
  // logical shift down the upper half and pack back to i8.
  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
  // and then ashr/lshr the upper bits down to the lower bits before multiply.
  unsigned Opcode = Op.getOpcode();
  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

  // AVX2 implementations - extend xmm subvectors to ymm.
  if (Subtarget.hasInt256()) {
    SDValue Lo = DAG.getIntPtrConstant(0, dl);
    SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);

    if (VT == MVT::v32i8) {
      SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
      SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
      SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
      SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
      ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
      BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
      AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
      BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
                       DAG.getConstant(8, dl, MVT::v16i16));
      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
                       DAG.getConstant(8, dl, MVT::v16i16));
      // The ymm variant of PACKUS treats the 128-bit lanes separately, so
      // before using PACKUS we need to permute the inputs to the correct
      // lo/hi xmm lane.
      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
                            16, 17, 18, 19, 20, 21, 22, 23};
      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            24, 25, 26, 27, 28, 29, 30, 31};
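      // After these two shuffles, each 128-bit lane of the PACKUS operands
      // holds the i16 results destined for that lane of the final v32i8, so
      // the lane-wise pack emits the bytes in source order.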
      return DAG.getNode(X86ISD::PACKUS, dl, VT,
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    }

    SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
    SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                               DAG.getConstant(8, dl, MVT::v16i16));
    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  assert(VT == MVT::v16i8 &&
         "Pre-AVX2 support only supports v16i8 multiplication");
  MVT ExVT = MVT::v8i16;

  // Extract the lo parts and zero/sign extend to i16.
  SDValue ALo, BLo;
  if (Subtarget.hasSSE41()) {
    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
  } else {
    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                            -1, 4, -1, 5, -1, 6, -1, 7};
    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    ALo = DAG.getBitcast(ExVT, ALo);
    BLo = DAG.getBitcast(ExVT, BLo);
    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  }

  // Extract the hi parts and zero/sign extend to i16.
  SDValue AHi, BHi;
  if (Subtarget.hasSSE41()) {
    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
                            -1, -1, -1, -1, -1, -1, -1, -1};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
  } else {
    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                            -1, 12, -1, 13, -1, 14, -1, 15};
    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
    AHi = DAG.getBitcast(ExVT, AHi);
    BHi = DAG.getBitcast(ExVT, BHi);
    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  }

  // Multiply, lshr the upper 8 bits down to the lower 8 bits of the lo/hi
  // results and pack back to v16i8.
  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}

SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op,
                                             SelectionDAG &DAG) const {
  assert(Subtarget.isTargetWin64() && "Unexpected target");
  EVT VT = Op.getValueType();
  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
         "Unexpected return type for lowering");

  RTLIB::Libcall LC;
  bool isSigned;
  switch (Op->getOpcode()) {
  default: llvm_unreachable("Unexpected request for libcall!");
  case ISD::SDIV:    isSigned = true;  LC = RTLIB::SDIV_I128;    break;
  case ISD::UDIV:    isSigned = false; LC = RTLIB::UDIV_I128;    break;
  case ISD::SREM:    isSigned = true;  LC = RTLIB::SREM_I128;    break;
  case ISD::UREM:    isSigned = false; LC = RTLIB::UREM_I128;    break;
  case ISD::SDIVREM: isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
  SDValue InChain = DAG.getEntryNode();

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;
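  // On Win64, i128 is not a legal argument type, so each operand is spilled
  // to a 16-byte-aligned stack slot and the libcall receives a pointer to it.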
  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
    EVT ArgVT = Op->getOperand(i).getValueType();
    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
           "Unexpected argument type for lowering");
    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
    Entry.Node = StackPtr;
    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
                           MachinePointerInfo(), /* Alignment = */ 16);
    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
    Entry.IsSExt = false;
    Entry.IsZExt = false;
    Args.push_back(Entry);
  }

  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
                                         getPointerTy(DAG.getDataLayout()));
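  // The libcall hands the i128 result back in XMM0, so the call is typed as
  // returning v2i64 and the result is bitcast back to the integer type below.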
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args))
      .setInRegister()
      .setSExtResult(isSigned)
      .setZExtResult(!isSigned);

  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}

static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256()) {
    unsigned Opcode = Op.getOpcode();
    unsigned NumElems = VT.getVectorNumElements();
    MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
    SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
    SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
    SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
    SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
    SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
    SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
    SDValue Ops[] = {
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    };
    return DAG.getMergeValues(Ops, dl);
  }
  assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
         (VT == MVT::v8i32 && Subtarget.hasInt256()));

  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
  // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  //
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
  // Place the odd value at an even position (basically, shift all values 1
  // step to the left):
  const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
  // <a|b|c|d> => <b|undef|d|undef>
  SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
                                      makeArrayRef(&Mask[0], VT.getVectorNumElements()));
  // <e|f|g|h> => <f|undef|h|undef>
  SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
                                      makeArrayRef(&Mask[0], VT.getVectorNumElements()));
  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
  // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
  // => <2 x i64> <ae|cg>
  SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
  // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
  // => <2 x i64> <bf|dh>
  SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

  // Shuffle it back into the right order.
  SDValue Highs, Lows;
  if (VT == MVT::v8i32) {
    const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  } else {
    const int HighMask[] = {1, 5, 3, 7};
    Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
    const int LowMask[] = {0, 4, 2, 6};
    Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  }
  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
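  // For 32-bit lanes:  mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
  //                                              - (b < 0 ? a : 0).
  // The SRA by 31 builds an all-ones mask for negative lanes and the ANDs
  // select the two correction terms.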
  if (IsSigned && !Subtarget.hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
        DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high value.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}

// Return true if the required (according to Opcode) shift-imm form is natively
// supported by the Subtarget.
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;

  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
    return true;

  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
                (VT.is256BitVector() && Subtarget.hasInt256());

  bool AShift = LShift && (Subtarget.hasAVX512() ||
                           (VT != MVT::v2i64 && VT != MVT::v4i64));
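  // There is no PSRAQ before AVX512, which is why v2i64/v4i64 arithmetic
  // shifts right are excluded on older subtargets.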
  return (Opcode == ISD::SRA) ? AShift : LShift;
}

// The shift amount is a variable, but it is the same for all vector lanes.
// These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
                                      unsigned Opcode) {
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}

// Return true if the required (according to Opcode) variable-shift form is
// natively supported by the Subtarget.
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
    return false;

  // vXi16 supported only on AVX-512, BWI.
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
    return false;

  if (Subtarget.hasAVX512())
    return true;

  bool LShift = VT.is128BitVector() || VT.is256BitVector();
  bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
}
static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
                    (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
    assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
    MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
    SDValue Ex = DAG.getBitcast(ExVT, R);

    // ashr(R, 63) === cmp_slt(R, 0)
    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
             "Unsupported PCMPGT op");
      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }

    if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt - 32, DAG);
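      // Viewed as v4i32/v8i32, each i64 result takes its low dword from Lower
      // (the old high dword shifted by Amt-32) and its high dword from Upper
      // (the sign splat); the shuffles below select exactly those elements.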
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
          getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
      Lower = DAG.getBitcast(ExVT, Lower);
      if (VT == MVT::v2i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
      if (VT == MVT::v4i64)
        Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                  {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };

  // Optimize shl/srl/sra with constant shift amount.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
      uint64_t ShiftAmt = ShiftConst->getZExtValue();

      if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
        return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

      // i64 SRA needs to be performed as partial shifts.
      if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
          Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
        return ArithmeticShiftRight64(ShiftAmt);

      if (VT == MVT::v16i8 ||
          (Subtarget.hasInt256() && VT == MVT::v32i8) ||
          VT == MVT::v64i8) {
        unsigned NumElts = VT.getVectorNumElements();
        MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

        // Simple i8 add case.
        if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
          return DAG.getNode(ISD::ADD, dl, VT, R, R);

        // ashr(R, 7) === cmp_slt(R, 0)
        if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
          SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
          if (VT.is512BitVector()) {
            assert(VT == MVT::v64i8 && "Unexpected element type!");
            SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
            return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
          }
          return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
        }

        // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
        if (VT == MVT::v16i8 && Subtarget.hasXOP())
          return SDValue();
        if (Op.getOpcode() == ISD::SHL) {
          // Make a large shift.
          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SHL = DAG.getBitcast(VT, SHL);
          // Zero out the rightmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SHL,
                             DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
        }
        if (Op.getOpcode() == ISD::SRL) {
          // Make a large shift.
          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                   R, ShiftAmt, DAG);
          SRL = DAG.getBitcast(VT, SRL);
          // Zero out the leftmost bits.
          return DAG.getNode(ISD::AND, dl, VT, SRL,
                             DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
        }
        if (Op.getOpcode() == ISD::SRA) {
          // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
          SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
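          // The lshr parks the byte's sign bit at position 7-ShiftAmt, where
          // Mask has its single set bit; XOR flips it and the SUB borrows
          // through the bits above it, i.e. sign-extends the result.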
          SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
          Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
          Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
          return Res;
        }
        llvm_unreachable("Unknown shift opcode.");
      }
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
  if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
       (Subtarget.hasAVX512() && VT == MVT::v8i64))) {

    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
    unsigned SubVectorScale = 1;
    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SubVectorScale =
          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
      Amt = Amt.getOperand(0);
    }
    // Peek through any splat that was introduced for i64 shift vectorization.
    int SplatIndex = -1;
    if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
      if (SVN->isSplat()) {
        SplatIndex = SVN->getSplatIndex();
        Amt = Amt.getOperand(0);
        assert(SplatIndex < (int)VT.getVectorNumElements() &&
               "Splat shuffle referencing second operand");
      }

    if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();

    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     (SubVectorScale * VT.getVectorNumElements());
    unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
    uint64_t ShiftAmt = 0;
    unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
    for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C)
        return SDValue();
      // 6 == Log2(64)
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }

    // Check remaining shift amounts (if not a splat).
    if (SplatIndex < 0) {
      for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
        uint64_t ShAmt = 0;
        for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C)
            return SDValue();
          // 6 == Log2(64)
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }

    if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
      return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

    if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}

static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
                     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

  unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
                     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
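  // A splatted shift amount can use the legacy SSE2 PSLL/PSRL/PSRA forms,
  // which shift every lane by the scalar held in the low 64 bits of an XMM
  // register; the code below just digs that scalar out of the amount operand.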
  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();

    if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
      // Check if this build_vector node is doing a splat.
      // If so, then set BaseShAmt equal to the splat value.
      BaseShAmt = BV->getSplatValue();
      if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
      if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
        Amt = Amt.getOperand(0);

      ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
      if (SVN && SVN->isSplat()) {
        unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
        SDValue InVec = Amt.getOperand(0);
        if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
          assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
                 "Unexpected shuffle index found!");
          BaseShAmt = InVec.getOperand(SplatIdx);
        } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
          if (ConstantSDNode *C =
                  dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
            if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }

        if (!BaseShAmt)
          // Avoid introducing an extract element from a shuffle.
          BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }

    if (BaseShAmt.getNode()) {
      assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
      if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
      else if (EltVT.bitsLT(MVT::i32))
        BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }
  }

  // Special case in 32-bit mode, where i64 is expanded into high and low parts.
  if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
      Amt.getOpcode() == ISD::BITCAST &&
      Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    Amt = Amt.getOperand(0);
    unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
                     VT.getVectorNumElements();
    std::vector<SDValue> Vals(Ratio);
    for (unsigned i = 0; i != Ratio; ++i)
      Vals[i] = Amt.getOperand(i);
    for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
      for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }

    if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }
  return SDValue();
}

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

  assert(VT.isVector() && "Custom lowering only for vector shifts!");
  assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;

  // XOP has 128-bit variable logical/arithmetic shifts.
  // +ve/-ve Amt = shift left/right.
  if (Subtarget.hasXOP() &&
      (VT == MVT::v2i64 || VT == MVT::v4i32 ||
       VT == MVT::v8i16 || VT == MVT::v16i8)) {
    if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
      SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
    if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
      return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
    if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }

  // 2i64 vector logical shifts can efficiently avoid scalarization - do the
  // shifts per-lane and then shuffle the partial results back together.
  if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
    // Splat the shift amounts so the scalar shifts above will catch it.
    SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
    SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
    SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }

  // i64 vector arithmetic shift can be emulated with the transform:
  // M = lshr(SIGN_MASK, Amt)
  // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
  if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
      Op.getOpcode() == ISD::SRA) {
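    // lshr moves each lane's sign bit to position 63-Amt, exactly where M has
    // its single set bit; the XOR/SUB pair then sign-extends that bit through
    // the upper bits.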
    SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
    SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
    R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
    R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }

  // If possible, lower this packed shift into a vector multiply instead of
  // expanding it into a sequence of scalar shifts.
  // Do this only if the vector shift count is a constant build_vector.
  if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
      (VT == MVT::v8i16 || VT == MVT::v4i32 ||
       (Subtarget.hasInt256() && VT == MVT::v16i16))) {
    SmallVector<SDValue, 8> Elts;
    MVT SVT = VT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    APInt One(SVTBits, 1);
    unsigned NumElems = VT.getVectorNumElements();

    for (unsigned i = 0; i != NumElems; ++i) {
      SDValue Op = Amt->getOperand(i);
      if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }

      ConstantSDNode *ND = cast<ConstantSDNode>(Op);
      APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
      uint64_t ShAmt = C.getZExtValue();
      if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
    SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }

  // Lower SHL with variable shift amount.
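  // (Amt << 23) + 0x3f800000 assembles, in each lane, the IEEE-754 bit
  // pattern of the float 2^Amt (biased exponent 127 + Amt, zero mantissa);
  // converting back to integer therefore yields 2^Amt, and the multiply
  // implements the variable shift without any variable-shift instruction.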
  if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
    Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

    Op = DAG.getNode(ISD::ADD, dl, VT, Op,
                     DAG.getConstant(0x3f800000U, dl, VT));
    Op = DAG.getBitcast(MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }

  // If possible, lower this shift as a sequence of two shifts by
  // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
  //
  // Could be rewritten as:
  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  //
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
  if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
    unsigned TargetOpcode = X86ISD::MOVSS;
    bool CanBeSimplified;
    // The splat value for the first packed shift (the 'X' from the example).
    SDValue Amt1 = Amt->getOperand(0);
    // The splat value for the second packed shift (the 'Y' from the example).
    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

    // See if it is possible to replace this node with a sequence of
    // two shifts followed by a MOVSS/MOVSD/PBLEND.
    if (VT == MVT::v4i32) {
      // Check if it is legal to use a MOVSS.
      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
                        Amt2 == Amt->getOperand(3);
      if (!CanBeSimplified) {
        // Otherwise, check if we can still simplify this node using a MOVSD.
        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        TargetOpcode = X86ISD::MOVSD;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
      CanBeSimplified = Amt1 == Amt->getOperand(1);
      for (unsigned i = 3; i != 8 && CanBeSimplified; ++i)
        CanBeSimplified = Amt2 == Amt->getOperand(i);

      if (!CanBeSimplified) {
        TargetOpcode = X86ISD::MOVSD;
        CanBeSimplified = true;
        Amt2 = Amt->getOperand(4);
        for (unsigned i = 0; i != 4 && CanBeSimplified; ++i)
          CanBeSimplified = Amt1 == Amt->getOperand(i);
        for (unsigned j = 4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }

    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
        isa<ConstantSDNode>(Amt2)) {
      // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
      MVT CastVT = MVT::v4i32;
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
      SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
      SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
      if (TargetOpcode == X86ISD::MOVSD)
        return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
                                                       BitCast2, {0, 1, 6, 7}));
      return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
                                                     BitCast2, {0, 5, 6, 7}));
    }
  }

  // v4i32 Non Uniform Shifts.
  // If the shift amount is constant we can shift each lane using the SSE2
  // immediate shifts, else we need to zero-extend each lane to the lower i64
  // and shift using the SSE2 variable shifts.
  // The separate results can then be blended together.
  if (VT == MVT::v4i32) {
    unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
      SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
      Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
      Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }

    SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
    SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
    SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
    SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
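    // Each Ri is fully correct only in lane i; the three shuffles below
    // gather lane 0 of R0, lane 1 of R1, lane 2 of R2 and lane 3 of R3.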
    SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
    SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }

  // It's worth extending once and using the vXi16/vXi32 shifts for smaller
  // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
  // make the existing SSE solution better.
  if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
      (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
      (Subtarget.hasBWI() && VT == MVT::v32i8)) {
    MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
    MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    R = DAG.getNode(ExtOpc, dl, ExtVT, R);
    Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }

  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);
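    // This is a bit-serial shifter: after the << 5, the sign bit of each byte
    // is bit 2 of its 3-bit shift amount. Each round conditionally applies a
    // shift by 4, 2 and then 1, doubling Amt ("a += a") between rounds so the
    // next amount bit moves into the sign position.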
    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }
    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16
      // vector so we can correctly sign extend. We don't care what happens
      // to the lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero upper
      // byte, meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }

  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
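    // Unpacking R against zero puts each i16 in the high half of an i32 lane
    // while the amount is zero-extended into the low half. A 32-bit shift by
    // that amount followed by a logical shift right by 16 then produces the
    // correctly shifted i16 for SHL, SRL and SRA alike, zero-extended so the
    // PACKUS below cannot saturate.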
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}

static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);

  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
}

static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
  // looks for this combo and may remove the "setcc" instruction if the "setcc"
  // has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
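    // X86ISD::UMUL produces (low, high, EFLAGS); result 2 carries the flags
    // that the setcc below inspects for overflow.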
    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}

/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86.
    // We must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
  // there is no benefit in turning such RMWs into loads, and it is actually
  // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SynchScope = AI->getSynchScope();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SynchScope == SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
      AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SynchScope);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}

static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceScope == CrossThread) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
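    // Without MFENCE we fall back to a locked no-op RMW, "lock or dword ptr
    // [esp], 0": any LOCK-prefixed instruction drains the store buffer and so
    // acts as a full barrier.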
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
        DAG.getRegister(X86::ESP, MVT::i32),    // Base
        DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
        DAG.getRegister(0, MVT::i32),           // Index
        DAG.getTargetConstant(0, dl, MVT::i32), // Disp
        DAG.getRegister(0, MVT::i32),           // Segment.
        Zero,
        Chain};
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch (T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
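  // CMPXCHG leaves the previous memory value in AL/AX/EAX/RAX and sets ZF on
  // success, which is why the value is read back from Reg and success is a
  // COND_E test of EFLAGS.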
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}

static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the input vector, e.g. from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT == MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT == MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

22909 }
22911 // The only element type left is i16.
22912 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22914 // To obtain pop count for each i16 element starting from the pop count for
22915 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22916 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22917 // directly supported.
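// Worked example (a sketch): if one i16 element holds the byte counts
// <lo = 3, hi = 2>, SHL-by-8 as i16 gives <0, 3>, the byte-wise ADD gives
// <3, 5>, and SRL-by-8 as i16 leaves the combined count 5 in the element.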
22918 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22919 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22920 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22921 DAG.getBitcast(ByteVecVT, V));
22922 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22923 }
22925 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22926 const X86Subtarget &Subtarget,
22927 SelectionDAG &DAG) {
22928 MVT VT = Op.getSimpleValueType();
22929 MVT EltVT = VT.getVectorElementType();
22930 unsigned VecSize = VT.getSizeInBits();
22932 // Implement a lookup table in register by using an algorithm based on:
22933 // http://wm.ite.pl/articles/sse-popcount.html
22935 // The general idea is that every lower byte nibble in the input vector is an
22936 // index into an in-register pre-computed pop count table. We then split up
22937 // the input vector into two new ones: (1) a vector with only the shifted-right
22938 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22939 // masked-out higher ones) for each byte. PSHUFB is used separately with both
22940 // to index the in-register table. Next, both are added and the result is an
22941 // i8 vector where each element contains the pop count for its input byte.
22943 // To obtain the pop count for elements != i8, we follow up with the same
22944 // approach and use additional tricks as described below.
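// Worked example for a single byte (a sketch): for 0xB7 (0b10110111) the
// high nibble 0xB indexes LUT[0xb] = 3 and the low nibble 0x7 indexes
// LUT[0x7] = 3, and 3 + 3 == 6 == popcount(0xB7).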
22946 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22947 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22948 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22949 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22951 int NumByteElts = VecSize / 8;
22952 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22953 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22954 SmallVector<SDValue, 64> LUTVec;
22955 for (int i = 0; i < NumByteElts; ++i)
22956 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22957 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22958 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22960 // High nibbles
22961 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22962 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22964 // Low nibbles
22965 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22967 // The input vector is used as the shuffle mask that indexes elements into
22968 // the LUT. After counting the low and high nibbles, add the two results to
22969 // obtain the final pop count per i8 element.
22970 SDValue HighPopCnt =
22971 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22972 SDValue LowPopCnt =
22973 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22974 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22976 if (EltVT == MVT::i8)
22977 return PopCnt;
22979 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22980 }
22982 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22983 const X86Subtarget &Subtarget,
22984 SelectionDAG &DAG) {
22985 MVT VT = Op.getSimpleValueType();
22986 assert(VT.is128BitVector() &&
22987 "Only 128-bit vector bitmath lowering supported.");
22989 int VecSize = VT.getSizeInBits();
22990 MVT EltVT = VT.getVectorElementType();
22991 int Len = EltVT.getSizeInBits();
22993 // This is the vectorized version of the "best" algorithm from
22994 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22995 // with a minor tweak to use a series of adds + shifts instead of vector
22996 // multiplications. Implemented for all integer vector types. We only use
22997 // this when we don't have SSSE3, which allows a LUT-based lowering that is
22998 // much faster, even faster than using native popcnt instructions.
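// For reference, the scalar form of the sequence implemented below (a
// sketch for one 32-bit element; the vector code splats the same 8-bit
// mask patterns across every element width):
//   v = v - ((v >> 1) & 0x55555555);
//   v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
//   v = (v + (v >> 4)) & 0x0F0F0F0F;  // byte-wise pop counts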
23000 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23001 MVT VT = V.getSimpleValueType();
23002 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23003 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23004 };
23005 auto GetMask = [&](SDValue V, APInt Mask) {
23006 MVT VT = V.getSimpleValueType();
23007 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23008 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23009 };
23011 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23012 // x86, so set the SRL type to have elements at least i16 wide. This is
23013 // correct because all of our SRLs are followed immediately by a mask anyway
23014 // that handles any bits that sneak into the high bits of the byte elements.
23015 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23017 SDValue V = Op;
23019 // v = v - ((v >> 1) & 0x55555555...)
23020 SDValue Srl =
23021 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23022 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23023 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23025 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23026 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23027 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23028 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23029 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23031 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23032 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23033 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23034 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23036 // At this point, V contains the byte-wise population count, and we are
23037 // merely doing a horizontal sum if necessary to get the wider element
23038 // type.
23039 if (EltVT == MVT::i8)
23040 return V;
23042 return LowerHorizontalByteSum(
23043 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23044 DAG);
23045 }
23047 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23048 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23049 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23050 SelectionDAG &DAG) {
23051 MVT VT = Op.getSimpleValueType();
23052 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23053 "Unknown CTPOP type to handle");
23054 SDLoc DL(Op.getNode());
23055 SDValue Op0 = Op.getOperand(0);
23057 if (!Subtarget.hasSSSE3()) {
23058 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23059 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23060 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23061 }
23063 // Decompose 256-bit ops into smaller 128-bit ops.
23064 if (VT.is256BitVector() && !Subtarget.hasInt256())
23065 return Lower256IntUnary(Op, DAG);
23067 // Decompose 512-bit ops into smaller 256-bit ops.
23068 if (VT.is512BitVector() && !Subtarget.hasBWI())
23069 return Lower512IntUnary(Op, DAG);
23071 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23072 }
23074 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23075 SelectionDAG &DAG) {
23076 assert(Op.getSimpleValueType().isVector() &&
23077 "We only do custom lowering for vector population count.");
23078 return LowerVectorCTPOP(Op, Subtarget, DAG);
23079 }
23081 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23082 MVT VT = Op.getSimpleValueType();
23083 SDValue In = Op.getOperand(0);
23084 SDLoc DL(Op);
23086 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23087 // perform the BITREVERSE.
23088 if (!VT.isVector()) {
23089 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23090 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23091 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23092 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23093 DAG.getIntPtrConstant(0, DL));
23094 }
23096 int NumElts = VT.getVectorNumElements();
23097 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23099 // Decompose 256-bit ops into smaller 128-bit ops.
23100 if (VT.is256BitVector())
23101 return Lower256IntUnary(Op, DAG);
23103 assert(VT.is128BitVector() &&
23104 "Only 128-bit vector bitreverse lowering supported.");
23106 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23107 // perform the BSWAP in the shuffle.
23108 // It's best to shuffle using the second operand, as this implicitly allows
23109 // memory folding for multiple vectors.
23110 SmallVector<SDValue, 16> MaskElts;
23111 for (int i = 0; i != NumElts; ++i) {
23112 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23113 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23114 int PermuteByte = SourceByte | (2 << 5);
23115 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23116 }
23117 }
23119 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23120 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23121 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23122 Res, Mask);
23123 return DAG.getBitcast(VT, Res);
23124 }
23126 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23127 SelectionDAG &DAG) {
23128 if (Subtarget.hasXOP())
23129 return LowerBITREVERSE_XOP(Op, DAG);
23131 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23133 MVT VT = Op.getSimpleValueType();
23134 SDValue In = Op.getOperand(0);
23135 SDLoc DL(Op);
23137 unsigned NumElts = VT.getVectorNumElements();
23138 assert(VT.getScalarType() == MVT::i8 &&
23139 "Only byte vector BITREVERSE supported");
23141 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23142 if (VT.is256BitVector() && !Subtarget.hasInt256())
23143 return Lower256IntUnary(Op, DAG);
23145 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23146 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
23147 // 0-15 value (which lands in the other nibble).
23148 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23149 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23150 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23152 const int LoLUT[16] = {
23153 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23154 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23155 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23156 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23157 const int HiLUT[16] = {
23158 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23159 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23160 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23161 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
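// Worked example (a sketch): for the byte 0x1E (0b00011110), the low
// nibble 0xE gives LoLUT[0xe] = 0x70 and the high nibble 0x1 gives
// HiLUT[0x1] = 0x08; OR-ing them yields 0x78 (0b01111000), the
// bit-reversal of 0x1E.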
23163 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23164 for (unsigned i = 0; i < NumElts; ++i) {
23165 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23166 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23169 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23170 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23171 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23172 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23173 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23174 }
23176 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23177 unsigned NewOpc = 0;
23178 switch (N->getOpcode()) {
23179 case ISD::ATOMIC_LOAD_ADD:
23180 NewOpc = X86ISD::LADD;
23181 break;
23182 case ISD::ATOMIC_LOAD_SUB:
23183 NewOpc = X86ISD::LSUB;
23184 break;
23185 case ISD::ATOMIC_LOAD_OR:
23186 NewOpc = X86ISD::LOR;
23187 break;
23188 case ISD::ATOMIC_LOAD_XOR:
23189 NewOpc = X86ISD::LXOR;
23190 break;
23191 case ISD::ATOMIC_LOAD_AND:
23192 NewOpc = X86ISD::LAND;
23193 break;
23194 default:
23195 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23196 }
23198 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23199 return DAG.getMemIntrinsicNode(
23200 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23201 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23202 /*MemVT=*/N->getSimpleValueType(0), MMO);
23203 }
23205 /// Lower atomic_load_ops into LOCK-prefixed operations.
23206 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23207 const X86Subtarget &Subtarget) {
23208 SDValue Chain = N->getOperand(0);
23209 SDValue LHS = N->getOperand(1);
23210 SDValue RHS = N->getOperand(2);
23211 unsigned Opc = N->getOpcode();
23212 MVT VT = N->getSimpleValueType(0);
23213 SDLoc DL(N);
23215 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23216 // can only be lowered when the result is unused. They should have already
23217 // been transformed into a cmpxchg loop in AtomicExpand.
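// For example (illustrative only): with an unused result,
//   atomicrmw add i32* %p, i32 1 seq_cst
// can be selected as "lock addl $1, (%rdi)", whereas a used result needs
// the exchanging form "lock xaddl %eax, (%rdi)" (LXADD) instead.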
23218 if (N->hasAnyUseOfValue(0)) {
23219 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23220 // select LXADD if LOCK_SUB can't be selected.
23221 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23222 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23223 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23224 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23225 RHS, AN->getMemOperand());
23226 }
23227 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23228 "Used AtomicRMW ops other than Add should have been expanded!");
23229 return N;
23230 }
23232 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23233 // RAUW the chain, but don't worry about the result, as it's unused.
23234 assert(!N->hasAnyUseOfValue(0));
23235 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23236 return SDValue();
23237 }
23239 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23240 SDNode *Node = Op.getNode();
23241 SDLoc dl(Node);
23242 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23244 // Convert seq_cst store -> xchg
23245 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23246 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23247 // (The only way to get a 16-byte store is cmpxchg16b)
23248 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23249 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23250 AtomicOrdering::SequentiallyConsistent ||
23251 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23252 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23253 cast<AtomicSDNode>(Node)->getMemoryVT(),
23254 Node->getOperand(0),
23255 Node->getOperand(1), Node->getOperand(2),
23256 cast<AtomicSDNode>(Node)->getMemOperand());
23257 return Swap.getValue(1);
23258 }
23259 // Other atomic stores have a simple pattern.
23260 return Op;
23261 }
23263 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23264 SDNode *N = Op.getNode();
23265 MVT VT = N->getSimpleValueType(0);
23267 // Let legalize expand this if it isn't a legal type yet.
23268 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23269 return SDValue();
23271 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23273 SDLoc DL(N);
23274 // Set the carry flag: adding all-ones to the incoming carry value produces a hardware carry-out exactly when that value is nonzero.
23275 SDValue Carry = Op.getOperand(2);
23276 EVT CarryVT = Carry.getValueType();
23277 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23278 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23279 Carry, DAG.getConstant(NegOne, DL, CarryVT));
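// E.g. (a sketch): an incoming carry of 1 computes 1 + 0xFF...FF, which
// wraps to zero and sets CF, while an incoming 0 leaves CF clear; the
// ADC/SBB below then consumes CF via Carry.getValue(1).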
23281 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23282 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23283 Op.getOperand(1), Carry.getValue(1));
23285 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23286 if (N->getValueType(1) == MVT::i1)
23287 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23289 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23290 }
23292 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23293 SelectionDAG &DAG) {
23294 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23296 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23297 // which returns the values as { float, float } (in XMM0) or
23298 // { double, double } (which is returned in XMM0, XMM1).
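// Illustrative prototype of the f64 entry point (a sketch of the ABI
// described above, not a declaration from any header; the struct name is
// hypothetical):
//   struct __sincos_ret { double Sin, Cos; };
//   struct __sincos_ret __sincos_stret(double X);
// so a single call yields both results without a second libcall.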
23299 SDLoc dl(Op);
23300 SDValue Arg = Op.getOperand(0);
23301 EVT ArgVT = Arg.getValueType();
23302 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23304 TargetLowering::ArgListTy Args;
23305 TargetLowering::ArgListEntry Entry;
23307 Entry.Node = Arg;
23308 Entry.Ty = ArgTy;
23309 Entry.IsSExt = false;
23310 Entry.IsZExt = false;
23311 Args.push_back(Entry);
23313 bool isF64 = ArgVT == MVT::f64;
23314 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23315 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23316 // the results are returned via SRet in memory.
23317 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23318 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23319 SDValue Callee =
23320 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23322 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23323 : (Type *)VectorType::get(ArgTy, 4);
23325 TargetLowering::CallLoweringInfo CLI(DAG);
23326 CLI.setDebugLoc(dl)
23327 .setChain(DAG.getEntryNode())
23328 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23330 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23332 if (isF64)
23333 // Returned in xmm0 and xmm1.
23334 return CallResult.first;
23336 // Returned in bits 0:31 and 32:63 of xmm0.
23337 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23338 CallResult.first, DAG.getIntPtrConstant(0, dl));
23339 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23340 CallResult.first, DAG.getIntPtrConstant(1, dl));
23341 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23342 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23343 }
23345 /// Widen a vector input to a vector of NVT. The
23346 /// input vector must have the same element type as NVT.
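/// For example, widening <2 x i32> <a, b> to <4 x i32> yields
/// <a, b, 0, 0> when FillWithZeroes is set and <a, b, undef, undef>
/// otherwise.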
23347 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23348 bool FillWithZeroes = false) {
23349 // Check if InOp already has the right width.
23350 MVT InVT = InOp.getSimpleValueType();
23351 if (InVT == NVT)
23352 return InOp;
23354 if (InOp.isUndef())
23355 return DAG.getUNDEF(NVT);
23357 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23358 "input and widen element type must match");
23360 unsigned InNumElts = InVT.getVectorNumElements();
23361 unsigned WidenNumElts = NVT.getVectorNumElements();
23362 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23363 "Unexpected request for vector widening");
23365 SDLoc dl(InOp);
23366 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23367 InOp.getNumOperands() == 2) {
23368 SDValue N1 = InOp.getOperand(1);
23369 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23370 N1.isUndef()) {
23371 InOp = InOp.getOperand(0);
23372 InVT = InOp.getSimpleValueType();
23373 InNumElts = InVT.getVectorNumElements();
23374 }
23375 }
23376 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23377 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23378 SmallVector<SDValue, 16> Ops;
23379 for (unsigned i = 0; i < InNumElts; ++i)
23380 Ops.push_back(InOp.getOperand(i));
23382 EVT EltVT = InOp.getOperand(0).getValueType();
23384 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23385 DAG.getUNDEF(EltVT);
23386 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23387 Ops.push_back(FillVal);
23388 return DAG.getBuildVector(NVT, dl, Ops);
23389 }
23390 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23391 DAG.getUNDEF(NVT);
23392 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23393 InOp, DAG.getIntPtrConstant(0, dl));
23394 }
23396 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23397 SelectionDAG &DAG) {
23398 assert(Subtarget.hasAVX512() &&
23399 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23401 // X86 scatter kills the mask register, so its type should be added to
23402 // the list of return values.
23403 // If the "scatter" has 2 return values, it is already handled.
23404 if (Op.getNode()->getNumValues() == 2)
23405 return Op;
23407 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23408 SDValue Src = N->getValue();
23409 MVT VT = Src.getSimpleValueType();
23410 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23412 SDLoc dl(Op);
23413 SDValue NewScatter;
23414 SDValue Index = N->getIndex();
23415 SDValue Mask = N->getMask();
23416 SDValue Chain = N->getChain();
23417 SDValue BasePtr = N->getBasePtr();
23418 MVT MemVT = N->getMemoryVT().getSimpleVT();
23419 MVT IndexVT = Index.getSimpleValueType();
23420 MVT MaskVT = Mask.getSimpleValueType();
23422 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23423 // The v2i32 value was promoted to v2i64.
23424 // Now we "redo" the type legalizer's work and widen the original
23425 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23426 // by taking the low 32 bits of each 64-bit element (shuffle mask {0, 2}).
23427 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23428 "Unexpected memory type");
23429 int ShuffleMask[] = {0, 2, -1, -1};
23430 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23431 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
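// E.g. (a sketch): a source <a, b> promoted to v2i64 reads as
// <a, ?, b, ?> when viewed as v4i32; the {0, 2, -1, -1} shuffle above
// produces <a, b, u, u>, the widened v4i32 payload for the scatter.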
23432 // Now we have 4 elements instead of 2.
23433 // Expand the index.
23434 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23435 Index = ExtendToType(Index, NewIndexVT, DAG);
23437 // Expand the mask with zeroes
23438 // Mask may be <2 x i64> or <2 x i1> at this moment
23439 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23440 "Unexpected mask type");
23441 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23442 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23443 }
23446 unsigned NumElts = VT.getVectorNumElements();
23447 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23448 !Index.getSimpleValueType().is512BitVector()) {
23449 // AVX512F supports only 512-bit vectors; either the data or the index
23450 // must be 512 bits wide. If both the index and data are 256-bit here
23451 // but the vector contains 8 elements, we just sign-extend the index.
23452 if (IndexVT == MVT::v8i32)
23453 // Just extend index
23454 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23455 else {
23456 // The minimal number of elts in scatter is 8
23457 NumElts = 8;
23458 // Index
23459 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23460 // Use original index here, do not modify the index twice
23461 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23462 if (IndexVT.getScalarType() == MVT::i32)
23463 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23466 // At this point we have promoted mask operand
23467 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23468 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23469 // Use the original mask here, do not modify the mask twice
23470 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23472 // The value that should be stored
23473 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23474 Src = ExtendToType(Src, NewVT, DAG);
23475 }
23476 }
23477 // If the mask is "wide" at this point, truncate it to an i1 vector.
23478 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23479 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23481 // The mask is killed by scatter; add it to the result values.
23482 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23483 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23484 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23485 N->getMemOperand());
23486 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23487 return SDValue(NewScatter.getNode(), 1);
23488 }
23490 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23491 SelectionDAG &DAG) {
23493 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23494 MVT VT = Op.getSimpleValueType();
23495 MVT ScalarVT = VT.getScalarType();
23496 SDValue Mask = N->getMask();
23497 SDLoc dl(Op);
23499 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23500 "Expanding masked load is supported on AVX-512 target only!");
23502 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23503 "Expanding masked load is supported for 32 and 64-bit types only!");
23505 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless
23506 // of VLX. Expanding loads of these types are handled below.
23507 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23508 return Op;
23510 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23511 "Cannot lower masked load op.");
23513 assert((ScalarVT.getSizeInBits() >= 32 ||
23514 (Subtarget.hasBWI() &&
23515 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23516 "Unsupported masked load op.");
23518 // This operation is legal for targets with VLX, but without
23519 // VLX the vector should be widened to 512 bits.
23520 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23521 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23522 SDValue Src0 = N->getSrc0();
23523 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23525 // Mask element has to be i1.
23526 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23527 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23528 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23530 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23532 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23533 if (MaskEltTy != MVT::i1)
23534 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23535 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23536 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23537 N->getBasePtr(), Mask, Src0,
23538 N->getMemoryVT(), N->getMemOperand(),
23539 N->getExtensionType(),
23540 N->isExpandingLoad());
23542 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23543 NewLoad.getValue(0),
23544 DAG.getIntPtrConstant(0, dl));
23545 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23546 return DAG.getMergeValues(RetOps, dl);
23547 }
23549 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23550 SelectionDAG &DAG) {
23551 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23552 SDValue DataToStore = N->getValue();
23553 MVT VT = DataToStore.getSimpleValueType();
23554 MVT ScalarVT = VT.getScalarType();
23555 SDValue Mask = N->getMask();
23556 SDLoc dl(Op);
23558 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23559 "Compressing masked store is supported on AVX-512 target only!");
23561 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23562 "Compressing masked store is supported for 32 and 64-bit types only!");
23564 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23565 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23566 return Op;
23568 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23569 "Cannot lower masked store op.");
23571 assert((ScalarVT.getSizeInBits() >= 32 ||
23572 (Subtarget.hasBWI() &&
23573 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23574 "Unsupported masked store op.");
23576 // This operation is legal for targets with VLX, but without
23577 // VLX the vector should be widened to 512 bits.
23578 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23579 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23581 // Mask element has to be i1.
23582 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23583 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23584 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23586 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23588 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23589 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23590 if (MaskEltTy != MVT::i1)
23591 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23592 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23593 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23594 Mask, N->getMemoryVT(), N->getMemOperand(),
23595 N->isTruncatingStore(), N->isCompressingStore());
23596 }
23598 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23599 SelectionDAG &DAG) {
23600 assert(Subtarget.hasAVX512() &&
23601 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23603 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23604 SDLoc dl(Op);
23605 MVT VT = Op.getSimpleValueType();
23606 SDValue Index = N->getIndex();
23607 SDValue Mask = N->getMask();
23608 SDValue Src0 = N->getValue();
23609 MVT IndexVT = Index.getSimpleValueType();
23610 MVT MaskVT = Mask.getSimpleValueType();
23612 unsigned NumElts = VT.getVectorNumElements();
23613 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23615 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23616 !Index.getSimpleValueType().is512BitVector()) {
23617 // AVX512F supports only 512-bit vectors; either the data or the index
23618 // must be 512 bits wide. If both the index and data are 256-bit here
23619 // but the vector contains 8 elements, we just sign-extend the index.
23620 if (NumElts == 8) {
23621 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23622 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23623 N->getOperand(3), Index };
23624 DAG.UpdateNodeOperands(N, Ops);
23625 return Op;
23626 }
23628 // Minimal number of elements in Gather
23629 NumElts = 8;
23630 // Index
23631 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23632 Index = ExtendToType(Index, NewIndexVT, DAG);
23633 if (IndexVT.getScalarType() == MVT::i32)
23634 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23637 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23638 // At this point we have promoted mask operand
23639 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23640 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23641 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23642 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23644 // The pass-through value
23645 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23646 Src0 = ExtendToType(Src0, NewVT, DAG);
23648 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23649 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23650 N->getMemoryVT(), dl, Ops,
23651 N->getMemOperand());
23652 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23653 NewGather.getValue(0),
23654 DAG.getIntPtrConstant(0, dl));
23655 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23656 return DAG.getMergeValues(RetOps, dl);
23657 }
23658 return Op;
23659 }
23661 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23662 SelectionDAG &DAG) const {
23663 // TODO: Eventually, the lowering of these nodes should be informed by or
23664 // deferred to the GC strategy for the function in which they appear. For
23665 // now, however, they must be lowered to something. Since they are logically
23666 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23667 // require special handling for these nodes), lower them as literal NOOPs for
23668 // the time being.
23669 SmallVector<SDValue, 2> Ops;
23671 Ops.push_back(Op.getOperand(0));
23672 if (Op->getGluedNode())
23673 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23676 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23677 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23679 return NOOP;
23680 }
23682 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23683 SelectionDAG &DAG) const {
23684 // TODO: Eventually, the lowering of these nodes should be informed by or
23685 // deferred to the GC strategy for the function in which they appear. For
23686 // now, however, they must be lowered to something. Since they are logically
23687 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23688 // require special handling for these nodes), lower them as literal NOOPs for
23689 // the time being.
23690 SmallVector<SDValue, 2> Ops;
23692 Ops.push_back(Op.getOperand(0));
23693 if (Op->getGluedNode())
23694 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23697 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23698 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23700 return NOOP;
23701 }
23703 /// Provide custom lowering hooks for some operations.
23704 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23705 switch (Op.getOpcode()) {
23706 default: llvm_unreachable("Should not custom lower this!");
23707 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23708 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23709 return LowerCMP_SWAP(Op, Subtarget, DAG);
23710 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23711 case ISD::ATOMIC_LOAD_ADD:
23712 case ISD::ATOMIC_LOAD_SUB:
23713 case ISD::ATOMIC_LOAD_OR:
23714 case ISD::ATOMIC_LOAD_XOR:
23715 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23716 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23717 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23718 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23719 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23720 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23721 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23722 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23723 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23724 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23725 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23726 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
23727 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23728 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23729 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23730 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23731 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23732 case ISD::SHL_PARTS:
23733 case ISD::SRA_PARTS:
23734 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23735 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23736 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23737 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23738 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23739 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23740 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23741 case ISD::ZERO_EXTEND_VECTOR_INREG:
23742 case ISD::SIGN_EXTEND_VECTOR_INREG:
23743 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23744 case ISD::FP_TO_SINT:
23745 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
23746 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23747 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23748 case ISD::FABS:
23749 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23750 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23751 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23752 case ISD::SETCC: return LowerSETCC(Op, DAG);
23753 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
23754 case ISD::SELECT: return LowerSELECT(Op, DAG);
23755 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23756 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23757 case ISD::VASTART: return LowerVASTART(Op, DAG);
23758 case ISD::VAARG: return LowerVAARG(Op, DAG);
23759 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23760 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23761 case ISD::INTRINSIC_VOID:
23762 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23763 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23764 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23765 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23766 case ISD::FRAME_TO_ARGS_OFFSET:
23767 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23768 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23769 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23770 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23771 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23772 case ISD::EH_SJLJ_SETUP_DISPATCH:
23773 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23774 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23775 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23776 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23777 case ISD::CTLZ:
23778 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23779 case ISD::CTTZ:
23780 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23781 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23782 case ISD::MULHS:
23783 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23784 case ISD::UMUL_LOHI:
23785 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23786 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23787 case ISD::SRA:
23788 case ISD::SRL:
23789 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23790 case ISD::SADDO:
23791 case ISD::UADDO:
23792 case ISD::SSUBO:
23793 case ISD::USUBO:
23794 case ISD::SMULO:
23795 case ISD::UMULO: return LowerXALUO(Op, DAG);
23796 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23797 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23798 case ISD::ADDCARRY:
23799 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
23800 case ISD::ADD:
23801 case ISD::SUB: return LowerADD_SUB(Op, DAG);
23802 case ISD::SMAX:
23803 case ISD::SMIN:
23804 case ISD::UMAX:
23805 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23806 case ISD::ABS: return LowerABS(Op, DAG);
23807 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23808 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23809 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23810 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23811 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23812 case ISD::GC_TRANSITION_START:
23813 return LowerGC_TRANSITION_START(Op, DAG);
23814 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23815 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23816 }
23817 }
23819 /// Places new result values for the node in Results (their number
23820 /// and types must exactly match those of the original return values of
23821 /// the node), or leaves Results empty, which indicates that the node is not
23822 /// to be custom lowered after all.
23823 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23824 SmallVectorImpl<SDValue> &Results,
23825 SelectionDAG &DAG) const {
23826 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23828 if (!Res.getNode())
23829 return;
23831 assert((N->getNumValues() <= Res->getNumValues()) &&
23832 "Lowering returned the wrong number of results!");
23834 // Place new result values based on the N result number.
23835 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23836 // than the original node; the chain (the last value) should then be dropped.
23837 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23838 Results.push_back(Res.getValue(I));
23839 }
23841 /// Replace a node with an illegal result type with a new node built out of
23842 /// custom code.
23843 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23844 SmallVectorImpl<SDValue>&Results,
23845 SelectionDAG &DAG) const {
23846 SDLoc dl(N);
23847 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23848 switch (N->getOpcode()) {
23849 default:
23850 llvm_unreachable("Do not know how to custom type legalize this operation!");
23851 case X86ISD::AVG: {
23852 // Legalize types for X86ISD::AVG by expanding vectors.
23853 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23855 auto InVT = N->getValueType(0);
23856 auto InVTSize = InVT.getSizeInBits();
23857 const unsigned RegSize =
23858 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23859 assert((Subtarget.hasBWI() || RegSize < 512) &&
23860 "512-bit vector requires AVX512BW");
23861 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23862 "256-bit vector requires AVX2");
23864 auto ElemVT = InVT.getVectorElementType();
23865 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23866 RegSize / ElemVT.getSizeInBits());
23867 assert(RegSize % InVT.getSizeInBits() == 0);
23868 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23870 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23871 Ops[0] = N->getOperand(0);
23872 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23873 Ops[0] = N->getOperand(1);
23874 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23876 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23877 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23878 DAG.getIntPtrConstant(0, dl)));
23879 return;
23880 }
23881 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23882 case X86ISD::FMINC:
23883 case X86ISD::FMIN:
23884 case X86ISD::FMAXC:
23885 case X86ISD::FMAX: {
23886 EVT VT = N->getValueType(0);
23887 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23888 SDValue UNDEF = DAG.getUNDEF(VT);
23889 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23890 N->getOperand(0), UNDEF);
23891 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23892 N->getOperand(1), UNDEF);
23893 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23894 return;
23895 }
23896 case ISD::SDIV:
23897 case ISD::UDIV:
23898 case ISD::SREM:
23899 case ISD::UREM:
23900 case ISD::SDIVREM:
23901 case ISD::UDIVREM: {
23902 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23903 Results.push_back(V);
23904 return;
23905 }
23906 case ISD::FP_TO_SINT:
23907 case ISD::FP_TO_UINT: {
23908 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23910 if (N->getValueType(0) == MVT::v2i32) {
23911 assert((IsSigned || Subtarget.hasAVX512()) &&
23912 "Can only handle signed conversion without AVX512");
23913 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23914 SDValue Src = N->getOperand(0);
23915 if (Src.getValueType() == MVT::v2f64) {
23916 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23917 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23918 : X86ISD::CVTTP2UI,
23919 dl, MVT::v4i32, Src);
23920 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23920 Results.push_back(Res);
23921 return;
23922 }
23924 if (Src.getValueType() == MVT::v2f32) {
23925 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23926 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23927 DAG.getUNDEF(MVT::v2f32));
23928 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23929 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23930 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23931 Results.push_back(Res);
23932 return;
23933 }
23935 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23936 // so early out here.
23937 return;
23938 }
23940 std::pair<SDValue,SDValue> Vals =
23941 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23942 SDValue FIST = Vals.first, StackSlot = Vals.second;
23943 if (FIST.getNode()) {
23944 EVT VT = N->getValueType(0);
23945 // Return a load from the stack slot.
23946 if (StackSlot.getNode())
23947 Results.push_back(
23948 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23949 else
23950 Results.push_back(FIST);
23951 }
23952 return;
23953 }
23954 case ISD::SINT_TO_FP: {
23955 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23956 SDValue Src = N->getOperand(0);
23957 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23958 return;
23959 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23960 return;
23961 }
23962 case ISD::UINT_TO_FP: {
23963 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23964 EVT VT = N->getValueType(0);
23965 if (VT != MVT::v2f32)
23966 return;
23967 SDValue Src = N->getOperand(0);
23968 EVT SrcVT = Src.getValueType();
23969 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23970 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23971 return;
23972 }
23973 if (SrcVT != MVT::v2i32)
23974 return;
23975 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23976 SDValue VBias =
23977 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
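// Note on the constant above (a sketch of the classic trick): the bit
// pattern 0x4330000000000000 is the double 2^52. OR-ing a zero-extended
// 32-bit value x into the low mantissa bits produces the double 2^52 + x
// exactly, so the FSUB below recovers x as a double; e.g. x = 5 gives
// (2^52 + 5) - 2^52 == 5.0.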
23978 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23979 DAG.getBitcast(MVT::v2i64, VBias));
23980 Or = DAG.getBitcast(MVT::v2f64, Or);
23981 // TODO: Are there any fast-math-flags to propagate here?
23982 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23983 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23984 return;
23985 }
23986 case ISD::FP_ROUND: {
23987 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23988 return;
23989 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23990 Results.push_back(V);
23991 return;
23992 }
23993 case ISD::FP_EXTEND: {
23994 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23995 // No other ValueType for FP_EXTEND should reach this point.
23996 assert(N->getValueType(0) == MVT::v2f32 &&
23997 "Do not know how to legalize this Node");
24000 case ISD::INTRINSIC_W_CHAIN: {
24001 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24002 switch (IntNo) {
24003 default : llvm_unreachable("Do not know how to custom type "
24004 "legalize this intrinsic operation!");
24005 case Intrinsic::x86_rdtsc:
24006 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24007 Results);
24008 case Intrinsic::x86_rdtscp:
24009 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24010 Results);
24011 case Intrinsic::x86_rdpmc:
24012 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24014 case Intrinsic::x86_xgetbv:
24015 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24016 }
24017 }
24018 case ISD::INTRINSIC_WO_CHAIN: {
24019 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24020 Results.push_back(V);
24021 return;
24022 }
24023 case ISD::READCYCLECOUNTER: {
24024 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24025 Results);
24026 }
24027 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24028 EVT T = N->getValueType(0);
24029 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24030 bool Regs64bit = T == MVT::i128;
24031 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24032 SDValue cpInL, cpInH;
24033 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24034 DAG.getConstant(0, dl, HalfT));
24035 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24036 DAG.getConstant(1, dl, HalfT));
24037 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24038 Regs64bit ? X86::RAX : X86::EAX,
24039 cpInL, SDValue());
24040 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24041 Regs64bit ? X86::RDX : X86::EDX,
24042 cpInH, cpInL.getValue(1));
24043 SDValue swapInL, swapInH;
24044 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24045 DAG.getConstant(0, dl, HalfT));
24046 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24047 DAG.getConstant(1, dl, HalfT));
24048 swapInH =
24049 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24050 swapInH, cpInH.getValue(1));
24051 // If the current function needs the base pointer, RBX,
24052 // we shouldn't use cmpxchg directly. Indeed, the lowering of that
24053 // instruction will clobber that register, and since RBX will be
24054 // a reserved register, the register allocator will not make sure
24055 // its value will be properly saved and restored
24056 // around this live-range.
24057 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24058 SDValue Result;
24059 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24060 unsigned BasePtr = TRI->getBaseRegister();
24061 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24062 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24063 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24064 // ISel prefers the LCMPXCHG64 variant.
24065 // If that assert breaks, that means it is not the case anymore,
24066 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24067 // not just EBX. This is a matter of accepting i64 input for that
24068 // pseudo, and restoring into the register of the right width
24069 // in the expand pseudo. Everything else should just work.
24070 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24071 "Saving only half of the RBX");
24072 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24073 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24074 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24075 Regs64bit ? X86::RBX : X86::EBX,
24076 HalfT, swapInH.getValue(1));
24077 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24078 RBXSave,
24079 /*Glue*/ RBXSave.getValue(2)};
24080 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24081 } else {
24082 unsigned Opcode =
24083 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24084 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24085 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24086 swapInH.getValue(1));
24087 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24088 swapInL.getValue(1)};
24089 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24090 }
24091 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24092 Regs64bit ? X86::RAX : X86::EAX,
24093 HalfT, Result.getValue(1));
24094 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24095 Regs64bit ? X86::RDX : X86::EDX,
24096 HalfT, cpOutL.getValue(2));
24097 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24099 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24100 MVT::i32, cpOutH.getValue(2));
24101 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24102 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24104 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24105 Results.push_back(Success);
24106 Results.push_back(EFLAGS.getValue(1));
24107 return;
24108 }
24109 case ISD::ATOMIC_SWAP:
24110 case ISD::ATOMIC_LOAD_ADD:
24111 case ISD::ATOMIC_LOAD_SUB:
24112 case ISD::ATOMIC_LOAD_AND:
24113 case ISD::ATOMIC_LOAD_OR:
24114 case ISD::ATOMIC_LOAD_XOR:
24115 case ISD::ATOMIC_LOAD_NAND:
24116 case ISD::ATOMIC_LOAD_MIN:
24117 case ISD::ATOMIC_LOAD_MAX:
24118 case ISD::ATOMIC_LOAD_UMIN:
24119 case ISD::ATOMIC_LOAD_UMAX:
24120 case ISD::ATOMIC_LOAD: {
24121 // Delegate to generic TypeLegalization. Situations we can really handle
24122 // should have already been dealt with by AtomicExpandPass.cpp.
24123 break;
24124 }
24125 case ISD::BITCAST: {
24126 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24127 EVT DstVT = N->getValueType(0);
24128 EVT SrcVT = N->getOperand(0)->getValueType(0);
24130 if (SrcVT != MVT::f64 ||
24131 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24132 return;
24134 unsigned NumElts = DstVT.getVectorNumElements();
24135 EVT SVT = DstVT.getVectorElementType();
24136 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24137 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24138 MVT::v2f64, N->getOperand(0));
24139 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24141 if (ExperimentalVectorWideningLegalization) {
24142 // If we are legalizing vectors by widening, we already have the desired
24143 // legal vector type, just return it.
24144 Results.push_back(ToVecInt);
24145 return;
24146 }
24148 SmallVector<SDValue, 8> Elts;
24149 for (unsigned i = 0, e = NumElts; i != e; ++i)
24150 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24151 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24153 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24154 }
24155 }
24156 }
24158 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24159 switch ((X86ISD::NodeType)Opcode) {
24160 case X86ISD::FIRST_NUMBER: break;
24161 case X86ISD::BSF: return "X86ISD::BSF";
24162 case X86ISD::BSR: return "X86ISD::BSR";
24163 case X86ISD::SHLD: return "X86ISD::SHLD";
24164 case X86ISD::SHRD: return "X86ISD::SHRD";
24165 case X86ISD::FAND: return "X86ISD::FAND";
24166 case X86ISD::FANDN: return "X86ISD::FANDN";
24167 case X86ISD::FOR: return "X86ISD::FOR";
24168 case X86ISD::FXOR: return "X86ISD::FXOR";
24169 case X86ISD::FILD: return "X86ISD::FILD";
24170 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24171 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24172 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24173 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24174 case X86ISD::FLD: return "X86ISD::FLD";
24175 case X86ISD::FST: return "X86ISD::FST";
24176 case X86ISD::CALL: return "X86ISD::CALL";
24177 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24178 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24179 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24180 case X86ISD::BT: return "X86ISD::BT";
24181 case X86ISD::CMP: return "X86ISD::CMP";
24182 case X86ISD::COMI: return "X86ISD::COMI";
24183 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24184 case X86ISD::CMPM: return "X86ISD::CMPM";
24185 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24186 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24187 case X86ISD::SETCC: return "X86ISD::SETCC";
24188 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24189 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24190 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24191 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24192 case X86ISD::CMOV: return "X86ISD::CMOV";
24193 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24194 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24195 case X86ISD::IRET: return "X86ISD::IRET";
24196 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24197 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24198 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24199 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24200 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24201 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24202 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24203 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24204 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24205 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24206 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24207 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24208 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24209 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24210 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24211 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24212 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24213 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24214 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24215 case X86ISD::HADD: return "X86ISD::HADD";
24216 case X86ISD::HSUB: return "X86ISD::HSUB";
24217 case X86ISD::FHADD: return "X86ISD::FHADD";
24218 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24219 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24220 case X86ISD::FMAX: return "X86ISD::FMAX";
24221 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24222 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24223 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24224 case X86ISD::FMIN: return "X86ISD::FMIN";
24225 case X86ISD::FMINS: return "X86ISD::FMINS";
24226 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24227 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24228 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24229 case X86ISD::FMINC: return "X86ISD::FMINC";
24230 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24231 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24232 case X86ISD::FRCP: return "X86ISD::FRCP";
24233 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24234 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24235 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24236 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24237 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24238 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24239 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24240 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24241 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24242 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24243 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24244 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24245 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24246 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24247 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24248 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24249 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24250 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24251 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24252 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24253 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24254 case X86ISD::LADD: return "X86ISD::LADD";
24255 case X86ISD::LSUB: return "X86ISD::LSUB";
24256 case X86ISD::LOR: return "X86ISD::LOR";
24257 case X86ISD::LXOR: return "X86ISD::LXOR";
24258 case X86ISD::LAND: return "X86ISD::LAND";
24259 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24260 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24261 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24262 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24263 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24264 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24265 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24266 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24267 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24268 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24269 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24270 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24271 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24272 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24273 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24274 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24275 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24276 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24277 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24278 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24279 case X86ISD::VSHL: return "X86ISD::VSHL";
24280 case X86ISD::VSRL: return "X86ISD::VSRL";
24281 case X86ISD::VSRA: return "X86ISD::VSRA";
24282 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24283 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24284 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24285 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24286 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24287 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24288 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24289 case X86ISD::CMPP: return "X86ISD::CMPP";
24290 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24291 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24292 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24293 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24294 case X86ISD::ADD: return "X86ISD::ADD";
24295 case X86ISD::SUB: return "X86ISD::SUB";
24296 case X86ISD::ADC: return "X86ISD::ADC";
24297 case X86ISD::SBB: return "X86ISD::SBB";
24298 case X86ISD::SMUL: return "X86ISD::SMUL";
24299 case X86ISD::UMUL: return "X86ISD::UMUL";
24300 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24301 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24302 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24303 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24304 case X86ISD::INC: return "X86ISD::INC";
24305 case X86ISD::DEC: return "X86ISD::DEC";
24306 case X86ISD::OR: return "X86ISD::OR";
24307 case X86ISD::XOR: return "X86ISD::XOR";
24308 case X86ISD::AND: return "X86ISD::AND";
24309 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24310 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24311 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24312 case X86ISD::PTEST: return "X86ISD::PTEST";
24313 case X86ISD::TESTP: return "X86ISD::TESTP";
24314 case X86ISD::TESTM: return "X86ISD::TESTM";
24315 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24316 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24317 case X86ISD::KTEST: return "X86ISD::KTEST";
24318 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24319 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24320 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24321 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24322 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24323 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24324 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24325 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24326 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24327 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24328 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24329 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24330 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24331 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24332 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24333 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24334 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24335 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24336 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24337 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24338 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24339 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24340 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24341 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24342 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24343 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24344 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24345 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24346 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24347 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24348 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24349 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24350 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24351 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24352 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24353 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24354 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24355 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24356 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24357 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24358 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24359 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24360 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24361 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24362 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24363 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24364 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24365 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24366 case X86ISD::SAHF: return "X86ISD::SAHF";
24367 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24368 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24369 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24370 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24371 case X86ISD::VPROT: return "X86ISD::VPROT";
24372 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24373 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24374 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24375 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24376 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24377 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24378 case X86ISD::FMADD: return "X86ISD::FMADD";
24379 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24380 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24381 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24382 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24383 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24384 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24385 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24386 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24387 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24388 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24389 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24390 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24391 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24392 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24393 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24394 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24395 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24396 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24397 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24398 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24399 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24400 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24401 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24402 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24403 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24404 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24405 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24406 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24407 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24408 case X86ISD::XTEST: return "X86ISD::XTEST";
24409 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24410 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24411 case X86ISD::SELECT: return "X86ISD::SELECT";
24412 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24413 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24414 case X86ISD::RCP28: return "X86ISD::RCP28";
24415 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24416 case X86ISD::EXP2: return "X86ISD::EXP2";
24417 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24418 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24419 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24420 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24421 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24422 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24423 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24424 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24425 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24426 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24427 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24428 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24429 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24430 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24431 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24432 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24433 case X86ISD::ADDS: return "X86ISD::ADDS";
24434 case X86ISD::SUBS: return "X86ISD::SUBS";
24435 case X86ISD::AVG: return "X86ISD::AVG";
24436 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24437 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24438 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24439 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24440 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24441 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24442 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24443 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24444 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24445 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24446 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24447 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24448 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24449 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24450 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24451 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24452 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24453 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24454 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24455 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24456 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24457 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24458 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24459 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24460 case X86ISD::LWPINS: return "X86ISD::LWPINS";
24461 }
24462 return nullptr;
24463 }
24465 /// Return true if the addressing mode represented by AM is legal for this
24466 /// target, for a load/store of the specified type.
24467 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24468 const AddrMode &AM, Type *Ty,
24469 unsigned AS) const {
24470 // X86 supports extremely general addressing modes.
24471 CodeModel::Model M = getTargetMachine().getCodeModel();
24473 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24474 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24475 return false;
24477 if (AM.BaseGV) {
24478 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24480 // If a reference to this global requires an extra load, we can't fold it.
24481 if (isGlobalStubReference(GVFlags))
24482 return false;
24484 // If BaseGV requires a register for the PIC base, we cannot also have a
24485 // BaseReg specified.
24486 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24487 return false;
24489 // If lower 4G is not available, then we must use rip-relative addressing.
24490 if ((M != CodeModel::Small || isPositionIndependent()) &&
24491 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24492 return false;
24493 }
24495 switch (AM.Scale) {
24496 case 0:
24497 case 1:
24498 case 2:
24499 case 4:
24500 case 8:
24501 // These scales always work.
24502 break;
24503 case 3:
24504 case 5:
24505 case 9:
24506 // These scales are formed with basereg+scalereg. Only accept if there is
24507 // no basereg yet.
24508 if (AM.HasBaseReg)
24509 return false;
24510 break;
24511 default: // Other stuff never works.
24512 return false;
24513 }
24515 return true;
24516 }
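// For example, a mode like "16(%rdi,%rcx,4)" (base + 4*index + disp) falls
// in the "always works" scales above, while Scale == 3 is only accepted
// with no base register present, because index*3 has to be matched as
// index + index*2, and the index then occupies the base slot of the SIB
// encoding.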
24518 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24519 unsigned Bits = Ty->getScalarSizeInBits();
24521 // 8-bit shifts are always expensive, and versions with a scalar amount
24522 // aren't particularly cheaper than those without.
24523 if (Bits == 8)
24524 return false;
24526 // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that make
24527 // variable shifts just as cheap as scalar ones.
24528 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24529 return false;
24531 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24532 // fully general vector.
24533 return true;
24534 }
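// For example, with AVX2 "vpsllvd %ymm1, %ymm0, %ymm0" already shifts every
// lane by its own amount, so a splatted scalar amount buys nothing there; on
// plain SSE2 a v4i32 shift by a scalar is a single "pslld %xmm1, %xmm0",
// whereas a fully variable v4i32 shift has to be scalarized.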
24536 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24537 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24538 return false;
24539 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24540 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24541 return NumBits1 > NumBits2;
24542 }
24544 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24545 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24546 return false;
24548 if (!isTypeLegal(EVT::getEVT(Ty1)))
24549 return false;
24551 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24553 // Assuming the caller doesn't have a zeroext or signext return parameter,
24554 // truncation all the way down to i1 is valid.
24555 return true;
24556 }
24558 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24559 return isInt<32>(Imm);
24560 }
24562 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24563 // Can also use sub to handle negated immediates.
24564 return isInt<32>(Imm);
24565 }
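// For example, "addq $0x7fffffff, %rax" encodes its immediate directly,
// while adding 0x100000000 would first need a "movabsq" into a scratch
// register, so only sign-extended 32-bit immediates are accepted here.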
24567 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24568 if (!VT1.isInteger() || !VT2.isInteger())
24569 return false;
24570 unsigned NumBits1 = VT1.getSizeInBits();
24571 unsigned NumBits2 = VT2.getSizeInBits();
24572 return NumBits1 > NumBits2;
24573 }
24575 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24576 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24577 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24578 }
24580 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24581 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24582 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24583 }
24585 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24586 EVT VT1 = Val.getValueType();
24587 if (isZExtFree(VT1, VT2))
24588 return true;
24590 if (Val.getOpcode() != ISD::LOAD)
24591 return false;
24593 if (!VT1.isSimple() || !VT1.isInteger() ||
24594 !VT2.isSimple() || !VT2.isInteger())
24595 return false;
24597 switch (VT1.getSimpleVT().SimpleTy) {
24598 default: break;
24599 case MVT::i8:
24600 case MVT::i16:
24601 case MVT::i32:
24602 // X86 has 8, 16, and 32-bit zero-extending loads.
24603 return true;
24604 }
24606 return false;
24607 }
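// For example, a zero-extending i8 load is the single instruction
// "movzbl (%rdi), %eax"; the extension comes for free with the load.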
24609 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24611 bool
24612 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24613 if (!Subtarget.hasAnyFMA())
24614 return false;
24616 VT = VT.getScalarType();
24618 if (!VT.isSimple())
24619 return false;
24621 switch (VT.getSimpleVT().SimpleTy) {
24622 case MVT::f32:
24623 case MVT::f64:
24624 return true;
24625 default:
24626 break;
24627 }
24629 return false;
24630 }
24632 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24633 // i16 instructions are longer (0x66 prefix) and potentially slower.
24634 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24635 }
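// For example, "addw $1, %ax" carries the 0x66 operand-size prefix (and the
// 16-bit immediate form can hit a length-changing-prefix stall on some
// cores), while "addl $1, %eax" does not.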
24637 /// Targets can use this to indicate that they only support *some*
24638 /// VECTOR_SHUFFLE operations, those with specific masks.
24639 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24640 /// are assumed to be legal.
24641 bool
24642 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24643 EVT VT) const {
24644 if (!VT.isSimple())
24645 return false;
24647 // Not for i1 vectors
24648 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24649 return false;
24651 // Very little shuffling can be done for 64-bit vectors right now.
24652 if (VT.getSimpleVT().getSizeInBits() == 64)
24653 return false;
24655 // We only care that the types being shuffled are legal. The lowering can
24656 // handle any possible shuffle mask that results.
24657 return isTypeLegal(VT.getSimpleVT());
24658 }
24660 bool
24661 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24662 EVT VT) const {
24663 // Just delegate to the generic legality; clear masks aren't special.
24664 return isShuffleMaskLegal(Mask, VT);
24665 }
24667 //===----------------------------------------------------------------------===//
24668 // X86 Scheduler Hooks
24669 //===----------------------------------------------------------------------===//
24671 /// Utility function to emit xbegin specifying the start of an RTM region.
24672 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24673 const TargetInstrInfo *TII) {
24674 DebugLoc DL = MI.getDebugLoc();
24676 const BasicBlock *BB = MBB->getBasicBlock();
24677 MachineFunction::iterator I = ++MBB->getIterator();
24679 // For the v = xbegin(), we generate
24680 //
24681 // thisMBB:
24682 //  xbegin fallMBB
24683 //
24684 // mainMBB:
24685 //  s0 = -1
24686 //
24687 // fallMBB:
24688 //  eax = # XABORT_DEF
24689 //  s1 = eax
24690 //
24691 // sinkMBB:
24692 //  v = phi(s0/mainBB, s1/fallBB)
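// Note that XBEGIN itself only writes EAX on an abort; the -1 result for a
// successfully started transaction is materialized explicitly in mainMBB,
// while fallMBB reads the hardware-provided abort status out of EAX.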
24694 MachineBasicBlock *thisMBB = MBB;
24695 MachineFunction *MF = MBB->getParent();
24696 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24697 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
24698 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24699 MF->insert(I, mainMBB);
24700 MF->insert(I, fallMBB);
24701 MF->insert(I, sinkMBB);
24703 // Transfer the remainder of BB and its successor edges to sinkMBB.
24704 sinkMBB->splice(sinkMBB->begin(), MBB,
24705 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24706 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24708 MachineRegisterInfo &MRI = MF->getRegInfo();
24709 unsigned DstReg = MI.getOperand(0).getReg();
24710 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
24711 unsigned mainDstReg = MRI.createVirtualRegister(RC);
24712 unsigned fallDstReg = MRI.createVirtualRegister(RC);
24714 // thisMBB:
24715 //  xbegin fallMBB
24716 //  # fallthrough to mainMBB
24717 //  # abort jumps to fallMBB
24718 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
24719 thisMBB->addSuccessor(mainMBB);
24720 thisMBB->addSuccessor(fallMBB);
24722 // mainMBB:
24723 //  mainDstReg := -1
24724 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
24725 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
24726 mainMBB->addSuccessor(sinkMBB);
24728 // fallMBB:
24729 //  ; pseudo instruction to model hardware's definition from XABORT
24730 // EAX := XABORT_DEF
24731 // fallDstReg := EAX
24732 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
24733 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
24734 .addReg(X86::EAX);
24735 fallMBB->addSuccessor(sinkMBB);
24737 // sinkMBB:
24738 //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
24739 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
24740 .addReg(mainDstReg).addMBB(mainMBB)
24741 .addReg(fallDstReg).addMBB(fallMBB);
24743 MI.eraseFromParent();
24745 return sinkMBB;
24746 }
24747 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24748 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24749 // in the .td file.
24750 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24751 const TargetInstrInfo *TII) {
24752 unsigned Opc;
24753 switch (MI.getOpcode()) {
24754 default: llvm_unreachable("illegal opcode!");
24755 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24756 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24757 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24758 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24759 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24760 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24761 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24762 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24763 }
24765 DebugLoc dl = MI.getDebugLoc();
24766 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24768 unsigned NumArgs = MI.getNumOperands();
24769 for (unsigned i = 1; i < NumArgs; ++i) {
24770 MachineOperand &Op = MI.getOperand(i);
24771 if (!(Op.isReg() && Op.isImplicit()))
24772 MIB.add(Op);
24773 }
24774 if (MI.hasOneMemOperand())
24775 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24777 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24778 .addReg(X86::XMM0);
24780 MI.eraseFromParent();
24781 return BB;
24782 }
24784 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24785 // defs in an instruction pattern
24786 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24787 const TargetInstrInfo *TII) {
24788 unsigned Opc;
24789 switch (MI.getOpcode()) {
24790 default: llvm_unreachable("illegal opcode!");
24791 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24792 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24793 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24794 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24795 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24796 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24797 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24798 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24799 }
24801 DebugLoc dl = MI.getDebugLoc();
24802 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24804 unsigned NumArgs = MI.getNumOperands(); // remove the results
24805 for (unsigned i = 1; i < NumArgs; ++i) {
24806 MachineOperand &Op = MI.getOperand(i);
24807 if (!(Op.isReg() && Op.isImplicit()))
24808 MIB.add(Op);
24809 }
24810 if (MI.hasOneMemOperand())
24811 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24813 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24814 .addReg(X86::ECX);
24816 MI.eraseFromParent();
24817 return BB;
24818 }
24820 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24821 const X86Subtarget &Subtarget) {
24822 DebugLoc dl = MI.getDebugLoc();
24823 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24825 // insert input VAL into EAX
24826 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24827 .addReg(MI.getOperand(0).getReg());
24828 // insert zero to ECX
24829 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24831 // insert zero to EDX
24832 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24834 // insert WRPKRU instruction
24835 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
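// The sequence built above is effectively:
//   movl %val, %eax
//   xorl %ecx, %ecx
//   xorl %edx, %edx
//   wrpkru
// since WRPKRU takes the new PKRU value in EAX and requires ECX and EDX to
// be zero (MOV32r0 expands to the xor zeroing idiom).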
24837 MI.eraseFromParent(); // The pseudo is gone now.
24838 return BB;
24839 }
24841 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24842 const X86Subtarget &Subtarget) {
24843 DebugLoc dl = MI.getDebugLoc();
24844 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24846 // insert zero to ECX
24847 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24849 // insert RDPKRU instruction
24850 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24851 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24852 .addReg(X86::EAX);
24854 MI.eraseFromParent(); // The pseudo is gone now.
24855 return BB;
24856 }
24858 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24859 const X86Subtarget &Subtarget,
24860 unsigned Opc) {
24861 DebugLoc dl = MI.getDebugLoc();
24862 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24863 // Address into RAX/EAX, other two args into ECX, EDX.
24864 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24865 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24866 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24867 for (int i = 0; i < X86::AddrNumOperands; ++i)
24868 MIB.add(MI.getOperand(i));
24870 unsigned ValOps = X86::AddrNumOperands;
24871 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24872 .addReg(MI.getOperand(ValOps).getReg());
24873 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24874 .addReg(MI.getOperand(ValOps + 1).getReg());
24876 // The instruction doesn't actually take any operands though.
24877 BuildMI(*BB, MI, dl, TII->get(Opc));
24879 MI.eraseFromParent(); // The pseudo is gone now.
24880 return BB;
24881 }
24883 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
24884 const X86Subtarget &Subtarget) {
24885 DebugLoc dl = MI->getDebugLoc();
24886 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24887 // Address into RAX/EAX
24888 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24889 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24890 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24891 for (int i = 0; i < X86::AddrNumOperands; ++i)
24892 MIB.add(MI->getOperand(i));
24894 // The instruction doesn't actually take any operands though.
24895 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
24897 MI->eraseFromParent(); // The pseudo is gone now.
24899 return BB;
24900 }
24903 MachineBasicBlock *
24904 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24905 MachineBasicBlock *MBB) const {
24906 // Emit va_arg instruction on X86-64.
24908 // Operands to this pseudo-instruction:
24909 // 0 ) Output : destination address (reg)
24910 // 1-5) Input : va_list address (addr, i64mem)
24911 // 6 ) ArgSize : Size (in bytes) of vararg type
24912 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24913 // 8 ) Align : Alignment of type
24914 // 9 ) EFLAGS (implicit-def)
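// For example, a C "va_arg(ap, double)" under the SysV AMD64 ABI would be
// emitted with ArgSize = 8, ArgMode = 2 (use fp_offset) and Align = 8,
// while an aggregate passed in memory would use ArgMode = 0.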
24916 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24917 static_assert(X86::AddrNumOperands == 5,
24918 "VAARG_64 assumes 5 address operands");
24920 unsigned DestReg = MI.getOperand(0).getReg();
24921 MachineOperand &Base = MI.getOperand(1);
24922 MachineOperand &Scale = MI.getOperand(2);
24923 MachineOperand &Index = MI.getOperand(3);
24924 MachineOperand &Disp = MI.getOperand(4);
24925 MachineOperand &Segment = MI.getOperand(5);
24926 unsigned ArgSize = MI.getOperand(6).getImm();
24927 unsigned ArgMode = MI.getOperand(7).getImm();
24928 unsigned Align = MI.getOperand(8).getImm();
24930 // Memory Reference
24931 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24932 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24933 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24935 // Machine Information
24936 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24937 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24938 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24939 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24940 DebugLoc DL = MI.getDebugLoc();
24942 // struct va_list {
24943 //   i32   gp_offset
24944 //   i32   fp_offset
24945 //   i64   overflow_area (address)
24946 //   i64   reg_save_area (address)
24947 // }
24948 // sizeof(va_list) = 24
24949 // alignment(va_list) = 8
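// gp_offset lives at byte 0 of the va_list and fp_offset at byte 4, which
// is why the accesses below use addDisp(Disp, UseFPOffset ? 4 : 0);
// overflow_area sits at byte 8 and reg_save_area at byte 16.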
24951 unsigned TotalNumIntRegs = 6;
24952 unsigned TotalNumXMMRegs = 8;
24953 bool UseGPOffset = (ArgMode == 1);
24954 bool UseFPOffset = (ArgMode == 2);
24955 unsigned MaxOffset = TotalNumIntRegs * 8 +
24956 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
24958 /* Align ArgSize to a multiple of 8 */
24959 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24960 bool NeedsAlign = (Align > 8);
24962 MachineBasicBlock *thisMBB = MBB;
24963 MachineBasicBlock *overflowMBB;
24964 MachineBasicBlock *offsetMBB;
24965 MachineBasicBlock *endMBB;
24967 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24968 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24969 unsigned OffsetReg = 0;
24971 if (!UseGPOffset && !UseFPOffset) {
24972 // If we only pull from the overflow region, we don't create a branch.
24973 // We don't need to alter control flow.
24974 OffsetDestReg = 0; // unused
24975 OverflowDestReg = DestReg;
24977 offsetMBB = nullptr;
24978 overflowMBB = thisMBB;
24981 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24982 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24983 // If not, pull from overflow_area. (branch to overflowMBB)
24988 // offsetMBB overflowMBB
24993 // Registers for the PHI in endMBB
24994 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24995 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24997 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24998 MachineFunction *MF = MBB->getParent();
24999 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25000 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25001 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25003 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25005 // Insert the new basic blocks
25006 MF->insert(MBBIter, offsetMBB);
25007 MF->insert(MBBIter, overflowMBB);
25008 MF->insert(MBBIter, endMBB);
25010 // Transfer the remainder of MBB and its successor edges to endMBB.
25011 endMBB->splice(endMBB->begin(), thisMBB,
25012 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25013 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25015 // Make offsetMBB and overflowMBB successors of thisMBB
25016 thisMBB->addSuccessor(offsetMBB);
25017 thisMBB->addSuccessor(overflowMBB);
25019 // endMBB is a successor of both offsetMBB and overflowMBB
25020 offsetMBB->addSuccessor(endMBB);
25021 overflowMBB->addSuccessor(endMBB);
25023 // Load the offset value into a register
25024 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25025 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25026 .add(Base)
25027 .add(Scale)
25028 .add(Index)
25029 .addDisp(Disp, UseFPOffset ? 4 : 0)
25030 .add(Segment)
25031 .setMemRefs(MMOBegin, MMOEnd);
25033 // Check if there is enough room left to pull this argument.
25034 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25035 .addReg(OffsetReg)
25036 .addImm(MaxOffset + 8 - ArgSizeA8);
25038 // Branch to "overflowMBB" if offset >= max
25039 // Fall through to "offsetMBB" otherwise
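// For a GP argument (ArgSizeA8 == 8) this takes the overflow path once
// gp_offset reaches 48, i.e. when all six integer registers (6 * 8 bytes)
// are used; for an FP argument, once fp_offset reaches 176 (48 + 8 * 16).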
25040 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25041 .addMBB(overflowMBB);
25044 // In offsetMBB, emit code to use the reg_save_area.
25045 if (offsetMBB) {
25046 assert(OffsetReg != 0);
25048 // Read the reg_save_area address.
25049 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25050 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25051 .add(Base)
25052 .add(Scale)
25053 .add(Index)
25054 .addDisp(Disp, 16)
25055 .add(Segment)
25056 .setMemRefs(MMOBegin, MMOEnd);
25058 // Zero-extend the offset
25059 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25060 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25061 .addImm(0)
25062 .addReg(OffsetReg)
25063 .addImm(X86::sub_32bit);
25065 // Add the offset to the reg_save_area to get the final address.
25066 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25067 .addReg(OffsetReg64)
25068 .addReg(RegSaveReg);
25070 // Compute the offset for the next argument
25071 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25072 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25073 .addReg(OffsetReg)
25074 .addImm(UseFPOffset ? 16 : 8);
25076 // Store it back into the va_list.
25077 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25078 .add(Base)
25079 .add(Scale)
25080 .add(Index)
25081 .addDisp(Disp, UseFPOffset ? 4 : 0)
25082 .add(Segment)
25083 .addReg(NextOffsetReg)
25084 .setMemRefs(MMOBegin, MMOEnd);
25087 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25088 .addMBB(endMBB);
25089 }
25092 // Emit code to use overflow area
25095 // Load the overflow_area address into a register.
25096 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25097 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25103 .setMemRefs(MMOBegin, MMOEnd);
25105 // If we need to align it, do so. Otherwise, just copy the address
25106 // to OverflowDestReg.
25107 if (NeedsAlign) {
25108 // Align the overflow address
25109 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25110 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25112 // aligned_addr = (addr + (align-1)) & ~(align-1)
25113 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25114 .addReg(OverflowAddrReg)
25115 .addImm(Align-1);
25117 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25118 .addReg(TmpReg)
25119 .addImm(~(uint64_t)(Align-1));
25120 } else {
25121 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25122 .addReg(OverflowAddrReg);
25123 }
25125 // Compute the next overflow address after this argument.
25126 // (the overflow address should be kept 8-byte aligned)
25127 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25128 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25129 .addReg(OverflowDestReg)
25130 .addImm(ArgSizeA8);
25132 // Store the new overflow address.
25133 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25134 .add(Base)
25135 .add(Scale)
25136 .add(Index)
25137 .addDisp(Disp, 8)
25138 .add(Segment)
25139 .addReg(NextAddrReg)
25140 .setMemRefs(MMOBegin, MMOEnd);
25142 // If we branched, emit the PHI to the front of endMBB.
25143 if (offsetMBB) {
25144 BuildMI(*endMBB, endMBB->begin(), DL,
25145 TII->get(X86::PHI), DestReg)
25146 .addReg(OffsetDestReg).addMBB(offsetMBB)
25147 .addReg(OverflowDestReg).addMBB(overflowMBB);
25148 }
25150 // Erase the pseudo instruction
25151 MI.eraseFromParent();
25153 return endMBB;
25154 }
25156 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25157 MachineInstr &MI, MachineBasicBlock *MBB) const {
25158 // Emit code to save XMM registers to the stack. The ABI says that the
25159 // number of registers to save is given in %al, so it's theoretically
25160 // possible to do an indirect jump trick to avoid saving all of them;
25161 // however, this code takes a simpler approach and just executes all
25162 // of the stores if %al is non-zero. It's less code, and it's probably
25163 // easier on the hardware branch predictor, and stores aren't all that
25164 // expensive anyway.
25166 // Create the new basic blocks. One block contains all the XMM stores,
25167 // and one block is the final destination regardless of whether any
25168 // stores were performed.
25169 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25170 MachineFunction *F = MBB->getParent();
25171 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25172 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25173 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25174 F->insert(MBBIter, XMMSaveMBB);
25175 F->insert(MBBIter, EndMBB);
25177 // Transfer the remainder of MBB and its successor edges to EndMBB.
25178 EndMBB->splice(EndMBB->begin(), MBB,
25179 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25180 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25182 // The original block will now fall through to the XMM save block.
25183 MBB->addSuccessor(XMMSaveMBB);
25184 // The XMMSaveMBB will fall through to the end block.
25185 XMMSaveMBB->addSuccessor(EndMBB);
25187 // Now add the instructions.
25188 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25189 DebugLoc DL = MI.getDebugLoc();
25191 unsigned CountReg = MI.getOperand(0).getReg();
25192 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25193 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25195 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25196 // If %al is 0, branch around the XMM save block.
25197 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25198 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25199 MBB->addSuccessor(EndMBB);
25200 }
25202 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25203 // that was just emitted, but clearly shouldn't be "saved".
25204 assert((MI.getNumOperands() <= 3 ||
25205 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25206 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25207 "Expected last argument to be EFLAGS");
25208 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25209 // In the XMM save block, save all the XMM argument registers.
25210 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25211 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25212 MachineMemOperand *MMO = F->getMachineMemOperand(
25213 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25214 MachineMemOperand::MOStore,
25215 /*Size=*/16, /*Align=*/16);
25216 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25217 .addFrameIndex(RegSaveFrameIndex)
25218 .addImm(/*Scale=*/1)
25219 .addReg(/*IndexReg=*/0)
25220 .addImm(/*Disp=*/Offset)
25221 .addReg(/*Segment=*/0)
25222 .addReg(MI.getOperand(i).getReg())
25223 .addMemOperand(MMO);
25224 }
25226 MI.eraseFromParent(); // The pseudo instruction is gone now.
25228 return EndMBB;
25229 }
25231 // The EFLAGS operand of SelectItr might be missing a kill marker
25232 // because there were multiple uses of EFLAGS, and ISel didn't know
25233 // which to mark. Figure out whether SelectItr should have had a
25234 // kill marker, and set it if it should. Returns the correct kill
25235 // marker value.
25236 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25237 MachineBasicBlock* BB,
25238 const TargetRegisterInfo* TRI) {
25239 // Scan forward through BB for a use/def of EFLAGS.
25240 MachineBasicBlock::iterator miI(std::next(SelectItr));
25241 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25242 const MachineInstr& mi = *miI;
25243 if (mi.readsRegister(X86::EFLAGS))
25244 return false;
25245 if (mi.definesRegister(X86::EFLAGS))
25246 break; // Should have kill-flag - update below.
25249 // If we hit the end of the block, check whether EFLAGS is live into a
25250 // successor.
25251 if (miI == BB->end()) {
25252 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25253 sEnd = BB->succ_end();
25254 sItr != sEnd; ++sItr) {
25255 MachineBasicBlock* succ = *sItr;
25256 if (succ->isLiveIn(X86::EFLAGS))
25257 return false;
25258 }
25259 }
25261 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25262 // out. SelectMI should have a kill flag on EFLAGS.
25263 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25264 return true;
25265 }
25267 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25268 // together with other CMOV pseudo-opcodes into a single basic block with a
25269 // conditional jump around it.
25270 static bool isCMOVPseudo(MachineInstr &MI) {
25271 switch (MI.getOpcode()) {
25272 case X86::CMOV_FR32:
25273 case X86::CMOV_FR64:
25274 case X86::CMOV_GR8:
25275 case X86::CMOV_GR16:
25276 case X86::CMOV_GR32:
25277 case X86::CMOV_RFP32:
25278 case X86::CMOV_RFP64:
25279 case X86::CMOV_RFP80:
25280 case X86::CMOV_V2F64:
25281 case X86::CMOV_V2I64:
25282 case X86::CMOV_V4F32:
25283 case X86::CMOV_V4F64:
25284 case X86::CMOV_V4I64:
25285 case X86::CMOV_V16F32:
25286 case X86::CMOV_V8F32:
25287 case X86::CMOV_V8F64:
25288 case X86::CMOV_V8I64:
25289 case X86::CMOV_V8I1:
25290 case X86::CMOV_V16I1:
25291 case X86::CMOV_V32I1:
25292 case X86::CMOV_V64I1:
25293 return true;
25295 default:
25296 return false;
25297 }
25298 }
25300 MachineBasicBlock *
25301 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25302 MachineBasicBlock *BB) const {
25303 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25304 DebugLoc DL = MI.getDebugLoc();
25306 // To "insert" a SELECT_CC instruction, we actually have to insert the
25307 // diamond control-flow pattern. The incoming instruction knows the
25308 // destination vreg to set, the condition code register to branch on, the
25309 // true/false values to select between, and a branch opcode to use.
25310 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25311 MachineFunction::iterator It = ++BB->getIterator();
25313 //  thisMBB:
25314 //  ...
25315 //   TrueVal = ...
25316 //   cmpTY ccX, r1, r2
25317 //   bCC copy1MBB
25318 //   fallthrough --> copy0MBB
25319 MachineBasicBlock *thisMBB = BB;
25320 MachineFunction *F = BB->getParent();
25322 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25323 // as described above, by inserting a BB, and then making a PHI at the join
25324 // point to select the true and false operands of the CMOV in the PHI.
25326 // The code also handles two different cases of multiple CMOV opcodes
25327 // in a row.
25328 //
25329 // Case 1:
25330 // In this case, there are multiple CMOVs in a row, all of which are based on
25331 // the same condition setting (or the exact opposite condition setting).
25332 // In this case we can lower all the CMOVs using a single inserted BB, and
25333 // then make a number of PHIs at the join point to model the CMOVs. The only
25334 // trickiness here is that in a case like:
25336 // t2 = CMOV cond1 t1, f1
25337 // t3 = CMOV cond1 t2, f2
25339 // when rewriting this into PHIs, we have to perform some renaming on the
25340 // temps since you cannot have a PHI operand refer to a PHI result earlier
25341 // in the same block. The "simple" but wrong lowering would be:
25343 // t2 = PHI t1(BB1), f1(BB2)
25344 // t3 = PHI t2(BB1), f2(BB2)
25346 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25347 // renaming is to note that on the path through BB1, t2 is really just a
25348 // copy of t1, and do that renaming, properly generating:
25350 // t2 = PHI t1(BB1), f1(BB2)
25351 // t3 = PHI t1(BB1), f2(BB2)
25353 // In case 2, we lower cascaded CMOVs such as
25355 // (CMOV (CMOV F, T, cc1), T, cc2)
25357 // to two successive branches. For that, we look for another CMOV as the
25358 // following instruction.
25360 // Without this, we would add a PHI between the two jumps, which ends up
25361 // creating a few copies all around. For instance, for
25363 // (sitofp (zext (fcmp une)))
25365 // we would generate:
25366 //
25367 //   ucomiss %xmm1, %xmm0
25368 //   movss <1.0f>, %xmm0
25369 //   movaps %xmm0, %xmm1
25370 //   jne .LBB5_2
25371 //   xorps %xmm1, %xmm1
25372 // .LBB5_2:
25373 //   jp .LBB5_4
25374 //   movaps %xmm1, %xmm0
25375 // .LBB5_4:
25376 //   retq
25377 //
25378 // because this custom-inserter would have generated:
25379 //
25380 //   A
25381 //   | \
25382 //   |  B
25383 //   | /
25384 //   C
25385 //   | \
25386 //   |  D
25387 //   | /
25388 //   E
25389 //
25390 // A: X = ...; Y = ...
25391 // B: empty
25392 // C: Z = PHI [X, A], [Y, B]
25393 // D: empty
25394 // E: PHI [X, C], [Z, D]
25395 //
25396 // If we lower both CMOVs in a single step, we can instead generate:
25397 //
25398 //   A
25399 //   | \
25400 //   |  C
25401 //   | /|
25402 //   |/ |
25403 //   |  |
25404 //   |  D
25405 //   | /
25406 //   E
25407 //
25408 // A: X = ...; Y = ...
25409 // D: empty
25410 // E: PHI [X, A], [X, C], [Y, D]
25411 //
25412 // Which, in our sitofp/fcmp example, gives us something like:
25413 //
25414 //   ucomiss %xmm1, %xmm0
25415 //   movss <1.0f>, %xmm0
25416 //   jne .LBB5_4
25417 //   jp .LBB5_4
25418 //   xorps %xmm0, %xmm0
25419 // .LBB5_4:
25420 //   retq
25421 //
25422 MachineInstr *CascadedCMOV = nullptr;
25423 MachineInstr *LastCMOV = &MI;
25424 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25425 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25426 MachineBasicBlock::iterator NextMIIt =
25427 std::next(MachineBasicBlock::iterator(MI));
25429 // Check for case 1, where there are multiple CMOVs with the same condition
25430 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25431 // number of jumps the most.
25433 if (isCMOVPseudo(MI)) {
25434 // See if we have a string of CMOVS with the same condition.
25435 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25436 (NextMIIt->getOperand(3).getImm() == CC ||
25437 NextMIIt->getOperand(3).getImm() == OppCC)) {
25438 LastCMOV = &*NextMIIt;
25439 ++NextMIIt;
25440 }
25441 }
25443 // This checks for case 2, but only do this if we didn't already find
25444 // case 1, as indicated by LastCMOV == MI.
25445 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25446 NextMIIt->getOpcode() == MI.getOpcode() &&
25447 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25448 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25449 NextMIIt->getOperand(1).isKill()) {
25450 CascadedCMOV = &*NextMIIt;
25451 }
25453 MachineBasicBlock *jcc1MBB = nullptr;
25455 // If we have a cascaded CMOV, we lower it to two successive branches to
25456 // the same block. EFLAGS is used by both, so mark it as live in the second.
25457 if (CascadedCMOV) {
25458 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25459 F->insert(It, jcc1MBB);
25460 jcc1MBB->addLiveIn(X86::EFLAGS);
25461 }
25463 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25464 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25465 F->insert(It, copy0MBB);
25466 F->insert(It, sinkMBB);
25468 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25469 // live into the sink and copy blocks.
25470 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25472 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25473 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25474 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25475 copy0MBB->addLiveIn(X86::EFLAGS);
25476 sinkMBB->addLiveIn(X86::EFLAGS);
25477 }
25479 // Transfer the remainder of BB and its successor edges to sinkMBB.
25480 sinkMBB->splice(sinkMBB->begin(), BB,
25481 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25482 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25484 // Add the true and fallthrough blocks as its successors.
25485 if (CascadedCMOV) {
25486 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25487 BB->addSuccessor(jcc1MBB);
25489 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25490 // jump to the sinkMBB.
25491 jcc1MBB->addSuccessor(copy0MBB);
25492 jcc1MBB->addSuccessor(sinkMBB);
25493 } else {
25494 BB->addSuccessor(copy0MBB);
25495 }
25497 // The true block target of the first (or only) branch is always sinkMBB.
25498 BB->addSuccessor(sinkMBB);
25500 // Create the conditional branch instruction.
25501 unsigned Opc = X86::GetCondBranchFromCond(CC);
25502 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25504 if (CascadedCMOV) {
25505 unsigned Opc2 = X86::GetCondBranchFromCond(
25506 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25507 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25508 }
25510 // copy0MBB:
25511 //  %FalseValue = ...
25512 //  # fallthrough to sinkMBB
25513 copy0MBB->addSuccessor(sinkMBB);
25515 // sinkMBB:
25516 //  %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25518 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25519 MachineBasicBlock::iterator MIItEnd =
25520 std::next(MachineBasicBlock::iterator(LastCMOV));
25521 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25522 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25523 MachineInstrBuilder MIB;
25525 // As we are creating the PHIs, we have to be careful if there is more than
25526 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25527 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25528 // That also means that PHI construction must work forward from earlier to
25529 // later, and that the code must maintain a mapping from earlier PHI's
25530 // destination registers, and the registers that went into the PHI.
25532 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25533 unsigned DestReg = MIIt->getOperand(0).getReg();
25534 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25535 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25537 // If this CMOV we are generating is the opposite condition from
25538 // the jump we generated, then we have to swap the operands for the
25539 // PHI that is going to be generated.
25540 if (MIIt->getOperand(3).getImm() == OppCC)
25541 std::swap(Op1Reg, Op2Reg);
25543 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25544 Op1Reg = RegRewriteTable[Op1Reg].first;
25546 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25547 Op2Reg = RegRewriteTable[Op2Reg].second;
25549 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25550 TII->get(X86::PHI), DestReg)
25551 .addReg(Op1Reg).addMBB(copy0MBB)
25552 .addReg(Op2Reg).addMBB(thisMBB);
25554 // Add this PHI to the rewrite table.
25555 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25558 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25559 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25560 if (CascadedCMOV) {
25561 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25562 // Copy the PHI result to the register defined by the second CMOV.
25563 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25564 DL, TII->get(TargetOpcode::COPY),
25565 CascadedCMOV->getOperand(0).getReg())
25566 .addReg(MI.getOperand(0).getReg());
25567 CascadedCMOV->eraseFromParent();
25570 // Now remove the CMOV(s).
25571 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25572 (MIIt++)->eraseFromParent();
25574 return sinkMBB;
25575 }
25577 MachineBasicBlock *
25578 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25579 MachineBasicBlock *BB) const {
25580 // Combine the following atomic floating-point modification pattern:
25581 // a.store(reg OP a.load(acquire), release)
25582 // Transform them into:
25583 // OPss (%gpr), %xmm
25584 // movss %xmm, (%gpr)
25585 // Or sd equivalent for 64-bit operations.
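// For example, "x.store(1.0f + x.load(acquire), release)" becomes roughly:
//   movss <1.0f>, %xmm0
//   addss (%rdi), %xmm0
//   movss %xmm0, (%rdi)
// This is sound because plain x86 loads already have acquire semantics and
// plain stores release semantics; the pattern is only formed when the
// load/op/store triple does not have to execute as one atomic RMW.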
25586 unsigned MOp, FOp;
25587 switch (MI.getOpcode()) {
25588 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25589 case X86::RELEASE_FADD32mr:
25590 FOp = X86::ADDSSrm;
25591 MOp = X86::MOVSSmr;
25592 break;
25593 case X86::RELEASE_FADD64mr:
25594 FOp = X86::ADDSDrm;
25595 MOp = X86::MOVSDmr;
25596 break;
25597 }
25598 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25599 DebugLoc DL = MI.getDebugLoc();
25600 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25601 unsigned ValOpIdx = X86::AddrNumOperands;
25602 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25603 MachineInstrBuilder MIB =
25604 BuildMI(*BB, MI, DL, TII->get(FOp),
25605 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25606 .addReg(VSrc);
25607 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25608 MachineOperand &Operand = MI.getOperand(i);
25609 // Clear any kill flags on register operands as we'll create a second
25610 // instruction using the same address operands.
25611 if (Operand.isReg())
25612 Operand.setIsKill(false);
25613 MIB.add(Operand);
25614 }
25615 MachineInstr *FOpMI = MIB;
25616 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25617 for (int i = 0; i < X86::AddrNumOperands; ++i)
25618 MIB.add(MI.getOperand(i));
25619 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25620 MI.eraseFromParent(); // The pseudo instruction is gone now.
25621 return BB;
25622 }
25624 MachineBasicBlock *
25625 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25626 MachineBasicBlock *BB) const {
25627 MachineFunction *MF = BB->getParent();
25628 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25629 DebugLoc DL = MI.getDebugLoc();
25630 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25632 assert(MF->shouldSplitStack());
25634 const bool Is64Bit = Subtarget.is64Bit();
25635 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25637 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25638 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25640 // BB:
25641 //  ... [Till the alloca]
25642 // If stacklet is not large enough, jump to mallocMBB
25643 //
25644 // bumpMBB:
25645 //  Allocate by subtracting from RSP
25646 //  Jump to continueMBB
25647 //
25648 // mallocMBB:
25649 //  Allocate by call to runtime
25650 //
25651 // continueMBB:
25652 //  ...
25653 //  [rest of original BB]
25656 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25657 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25658 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25660 MachineRegisterInfo &MRI = MF->getRegInfo();
25661 const TargetRegisterClass *AddrRegClass =
25662 getRegClassFor(getPointerTy(MF->getDataLayout()));
25664 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25665 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25666 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25667 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25668 sizeVReg = MI.getOperand(1).getReg(),
25669 physSPReg =
25670 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25672 MachineFunction::iterator MBBIter = ++BB->getIterator();
25674 MF->insert(MBBIter, bumpMBB);
25675 MF->insert(MBBIter, mallocMBB);
25676 MF->insert(MBBIter, continueMBB);
25678 continueMBB->splice(continueMBB->begin(), BB,
25679 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25680 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25682 // Add code to the main basic block to check if the stack limit has been hit,
25683 // and if so, jump to mallocMBB otherwise to bumpMBB.
25684 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25685 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25686 .addReg(tmpSPVReg).addReg(sizeVReg);
25687 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25688 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25689 .addReg(SPLimitVReg);
25690 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25692 // bumpMBB simply decreases the stack pointer, since we know the current
25693 // stacklet has enough space.
25694 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25695 .addReg(SPLimitVReg);
25696 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25697 .addReg(SPLimitVReg);
25698 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25700 // Calls into a routine in libgcc to allocate more space from the heap.
25701 const uint32_t *RegMask =
25702 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25703 if (IsLP64) {
25704 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25705 .addReg(sizeVReg);
25706 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25707 .addExternalSymbol("__morestack_allocate_stack_space")
25708 .addRegMask(RegMask)
25709 .addReg(X86::RDI, RegState::Implicit)
25710 .addReg(X86::RAX, RegState::ImplicitDefine);
25711 } else if (Is64Bit) {
25712 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25713 .addReg(sizeVReg);
25714 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25715 .addExternalSymbol("__morestack_allocate_stack_space")
25716 .addRegMask(RegMask)
25717 .addReg(X86::EDI, RegState::Implicit)
25718 .addReg(X86::EAX, RegState::ImplicitDefine);
25719 } else {
25720 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25721 .addImm(16);
25722 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25723 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25724 .addExternalSymbol("__morestack_allocate_stack_space")
25725 .addRegMask(RegMask)
25726 .addReg(X86::EAX, RegState::ImplicitDefine);
25730 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25731 .addImm(16);
25732 }
25733 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25734 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25735 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25737 // Set up the CFG correctly.
25738 BB->addSuccessor(bumpMBB);
25739 BB->addSuccessor(mallocMBB);
25740 mallocMBB->addSuccessor(continueMBB);
25741 bumpMBB->addSuccessor(continueMBB);
25743 // Take care of the PHI nodes.
25744 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25745 MI.getOperand(0).getReg())
25746 .addReg(mallocPtrVReg)
25747 .addMBB(mallocMBB)
25748 .addReg(bumpSPPtrVReg)
25749 .addMBB(bumpMBB);
25751 // Delete the original pseudo instruction.
25752 MI.eraseFromParent();
25755 return continueMBB;
25756 }
25758 MachineBasicBlock *
25759 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25760 MachineBasicBlock *BB) const {
25761 MachineFunction *MF = BB->getParent();
25762 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25763 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25764 DebugLoc DL = MI.getDebugLoc();
25766 assert(!isAsynchronousEHPersonality(
25767 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25768 "SEH does not use catchret!");
25770 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25771 if (!Subtarget.is32Bit())
25772 return BB;
25774 // C++ EH creates a new target block to hold the restore code, and wires up
25775 // the new block to the return destination with a normal JMP_4.
25776 MachineBasicBlock *RestoreMBB =
25777 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25778 assert(BB->succ_size() == 1);
25779 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25780 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25781 BB->addSuccessor(RestoreMBB);
25782 MI.getOperand(0).setMBB(RestoreMBB);
25784 auto RestoreMBBI = RestoreMBB->begin();
25785 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25786 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25787 return BB;
25788 }
25790 MachineBasicBlock *
25791 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25792 MachineBasicBlock *BB) const {
25793 MachineFunction *MF = BB->getParent();
25794 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25795 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25796 // Only 32-bit SEH requires special handling for catchpad.
25797 if (IsSEH && Subtarget.is32Bit()) {
25798 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25799 DebugLoc DL = MI.getDebugLoc();
25800 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25801 }
25802 MI.eraseFromParent();
25803 return BB;
25804 }
25806 MachineBasicBlock *
25807 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25808 MachineBasicBlock *BB) const {
25809 // So, here we replace TLSADDR with the sequence:
25810 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25811 // We need this because TLSADDR is lowered into a call
25812 // inside MC; without the two markers, shrink-wrapping
25813 // may push the prologue/epilogue past them.
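// After this, the block looks something like:
//   ADJCALLSTACKDOWN64 0, 0, 0
//   TLS_addr64 ...
//   ADJCALLSTACKUP64 0, 0
// so the frame-layout passes treat the hidden __tls_get_addr call like any
// other call site.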
25814 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25815 DebugLoc DL = MI.getDebugLoc();
25816 MachineFunction &MF = *BB->getParent();
25818 // Emit CALLSEQ_START right before the instruction.
25819 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25820 MachineInstrBuilder CallseqStart =
25821 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
25822 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25824 // Emit CALLSEQ_END right after the instruction.
25825 // We don't call erase from parent because we want to keep the
25826 // original instruction around.
25827 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25828 MachineInstrBuilder CallseqEnd =
25829 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25830 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
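// The resulting sequence around the (retained) pseudo is then, sketched with
// x86's call-frame pseudos:
//
//   ADJCALLSTACKDOWN 0, 0, 0    ; CALLSEQ_START marker
//   TLSADDR ...                 ; becomes a real call inside MC
//   ADJCALLSTACKUP 0, 0         ; CALLSEQ_END marker
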
MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy. We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX and doing an indirect call. The return value will then
  // be in the normal return register.
  MachineFunction *F = BB->getParent();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI.getOperand(3).isGlobal() && "This should be a global");

  // Get a register mask for the lowered call.
  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
  // proper register mask.
  const uint32_t *RegMask =
      Subtarget.is64Bit() ?
      Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
      Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
  if (Subtarget.is64Bit()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
    MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else if (!isPositionIndependent()) {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
    MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(0)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
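// On x86-64 this materializes the usual Darwin TLV access pattern (a sketch;
// 'var' stands for whichever global the pseudo references):
//
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)               ; thread-local address returned in %rax
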
MachineBasicBlock *
X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  unsigned DstReg;
  unsigned MemOpndSlot = 0;

  unsigned CurOp = 0;

  DstReg = MI.getOperand(CurOp++).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");

  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

  MemOpndSlot = CurOp;

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);
  MF->push_back(restoreMBB);
  restoreMBB->setHasAddressTaken();

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // thisMBB:
  unsigned PtrStoreOpc = 0;
  unsigned LabelReg = 0;
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  // Prepare IP either in reg or imm.
  if (!UseImmLabel) {
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
    LabelReg = MRI.createVirtualRegister(PtrRC);
    if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
                .addReg(X86::RIP)
                .addImm(0)
                .addReg(0)
                .addMBB(restoreMBB)
                .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
                .addReg(XII->getGlobalBaseReg(MF))
                .addImm(0)
                .addReg(0)
                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
                .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  // Store IP
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.add(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
            .addMBB(restoreMBB);

  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  MIB.addRegMask(RegInfo->getNoPreservedMask());
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(restoreMBB);

  // mainMBB:
  //  v_main = 0
  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(restoreDstReg).addMBB(restoreMBB);

  // restoreMBB:
  if (RegInfo->hasBasePointer(*MF)) {
    const bool Uses64BitFramePtr =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
    X86FI->setRestoreBasePointer(MF);
    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
    unsigned BasePtr = RegInfo->getBaseRegister();
    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  restoreMBB->addSuccessor(sinkMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
MachineBasicBlock *
X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
  unsigned SP = RegInfo->getStackRegister();

  MachineInstrBuilder MIB;

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset = 2 * PVT.getStoreSize();

  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

  // Reload FP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload IP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Reload SP
  MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
    if (i == X86::AddrDisp)
      MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.add(MI.getOperand(i));
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);
  // Jump
  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

  MI.eraseFromParent();
  return MBB;
}
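// Buffer layout assumed by this setjmp/longjmp pair (pointer-sized slots):
//   buf[0] = frame pointer   (reloaded into FP above)
//   buf[1] = resume address  (LabelOffset; reloaded into Tmp and jumped to)
//   buf[2] = stack pointer   (SPOffset; reloaded into SP)
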
void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                               MachineBasicBlock *MBB,
                                               MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();

  MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();

  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
    const TargetRegisterClass *TRC =
        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
    VR = MRI->createVirtualRegister(TRC);
    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

    if (Subtarget.is64Bit())
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
          .addReg(X86::RIP)
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB)
          .addReg(0);
    else
      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
          .addReg(0) /* TII->getGlobalBaseReg(MF) */
          .addImm(1)
          .addReg(0)
          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
          .addReg(0);
  }

  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
  addFrameReference(MIB, FI, 36);
  if (UseImmLabel)
    MIB.addMBB(DispatchBB);
  else
    MIB.addReg(VR);
}
MachineBasicBlock *
X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                         MachineBasicBlock *BB) const {
  DebugLoc DL = MI.getDebugLoc();
  MachineFunction *MF = BB->getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  MachineRegisterInfo *MRI = &MF->getRegInfo();
  const X86InstrInfo *TII = Subtarget.getInstrInfo();
  int FI = MFI.getFunctionContextIndex();

  // Get a mapping of the call site numbers to all of the landing pads they're
  // associated with.
  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
  unsigned MaxCSNum = 0;
  for (auto &MBB : *MF) {
    if (!MBB.isEHPad())
      continue;

    MCSymbol *Sym = nullptr;
    for (const auto &MI : MBB) {
      if (MI.isDebugValue())
        continue;

      assert(MI.isEHLabel() && "expected EH_LABEL");
      Sym = MI.getOperand(0).getMCSymbol();
      break;
    }

    if (!MF->hasCallSiteLandingPad(Sym))
      continue;

    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
      CallSiteNumToLPad[CSI].push_back(&MBB);
      MaxCSNum = std::max(MaxCSNum, CSI);
    }
  }

  // Get an ordered list of the machine basic blocks for the jump table.
  std::vector<MachineBasicBlock *> LPadList;
  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
  LPadList.reserve(CallSiteNumToLPad.size());

  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
    for (auto &LP : CallSiteNumToLPad[CSI]) {
      LPadList.push_back(LP);
      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
    }
  }

  assert(!LPadList.empty() &&
         "No landing pad destinations for the dispatch jump table!");

  // Create the MBBs for the dispatch code.

  // Shove the dispatch's address into the return slot in the function context.
  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
  DispatchBB->setIsEHPad(true);

  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
  DispatchBB->addSuccessor(TrapBB);

  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
  DispatchBB->addSuccessor(DispContBB);

  // Insert MBBs.
  MF->push_back(DispatchBB);
  MF->push_back(DispContBB);
  MF->push_back(TrapBB);

  // Insert code into the entry block that creates and registers the function
  // context.
  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

  // Create the jump table and associated information
  MachineJumpTableInfo *JTI =
      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

  const X86RegisterInfo &RI = TII->getRegisterInfo();
  // Add a register mask with no preserved registers. This results in all
  // registers being marked as clobbered.
  if (RI.hasBasePointer(*MF)) {
    const bool FPIs64Bit =
        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
    X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
    MFI->setRestoreBasePointer(MF);

    unsigned FP = RI.getFrameRegister(*MF);
    unsigned BP = RI.getBaseRegister();
    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
                 MFI->getRestoreBasePointerOffset())
        .addRegMask(RI.getNoPreservedMask());
  } else {
    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
        .addRegMask(RI.getNoPreservedMask());
  }

  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
                    4);
  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
      .addReg(IReg)
      .addImm(LPadList.size());
  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);

  unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
      .addReg(IReg)
      .addImm(1);
  BuildMI(DispContBB, DL,
          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
      .addReg(0)
      .addImm(Subtarget.is64Bit() ? 8 : 4)
      .addReg(JReg)
      .addJumpTableIndex(MJTI)
      .addReg(0);

  // Add the jump table entries as successors to the MBB.
  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
  for (auto &LP : LPadList)
    if (SeenMBBs.insert(LP).second)
      DispContBB->addSuccessor(LP);

  // N.B. the order the invoke BBs are processed in doesn't matter here.
  SmallVector<MachineBasicBlock *, 64> MBBLPads;
  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
  for (MachineBasicBlock *MBB : InvokeBBs) {
    // Remove the landing pad successor from the invoke block and replace it
    // with the new dispatch block.
    // Keep a copy of Successors since it's modified inside the loop.
    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
                                                   MBB->succ_rend());
    // FIXME: Avoid quadratic complexity.
    for (auto MBBS : Successors) {
      if (MBBS->isEHPad()) {
        MBB->removeSuccessor(MBBS);
        MBBLPads.push_back(MBBS);
      }
    }

    MBB->addSuccessor(DispatchBB);

    // Find the invoke call and mark all of the callee-saved registers as
    // 'implicit defined' so that they're spilled. This prevents code from
    // moving instructions to before the EH block, where they will never be
    // executed.
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;

      DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;

      MachineInstrBuilder MIB(*MF, &II);
      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
        unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }

      break;
    }
  }

  // Mark all former landing pads as non-landing pads. The dispatch is the only
  // landing pad now.
  for (auto &LP : MBBLPads)
    LP->setIsEHPad(false);

  // The instruction is gone now.
  MI.eraseFromParent();

  return BB;
}
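// A rough sketch of what the dispatch blocks boil down to (64-bit flavour;
// virtual registers shown as %eax/%rax for readability, and the exact
// function-context slot offset is an implementation detail):
//
//   DispatchBB:
//     movl <call-site slot>, %eax   ; reload the active call-site index
//     cmpl $NumLPads, %eax
//     ja   TrapBB                   ; out of range -> trap (ud2)
//   DispContBB:
//     subl $1, %eax                 ; call-site numbers are 1-based
//     jmpq *JTI(,%rax,8)            ; indexed jump through the landing pads
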
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                               MachineBasicBlock *BB) const {
  MachineFunction *MF = BB->getParent();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  default: llvm_unreachable("Unexpected instr type to insert");
  case X86::TAILJMPd64:
  case X86::TAILJMPr64:
  case X86::TAILJMPm64:
  case X86::TAILJMPr64_REX:
  case X86::TAILJMPm64_REX:
    llvm_unreachable("TAILJMP64 would not be touched here.");
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return BB;
  case X86::TLS_addr32:
  case X86::TLS_addr64:
  case X86::TLS_base_addr32:
  case X86::TLS_base_addr64:
    return EmitLoweredTLSAddr(MI, BB);
  case X86::CATCHRET:
    return EmitLoweredCatchRet(MI, BB);
  case X86::CATCHPAD:
    return EmitLoweredCatchPad(MI, BB);
  case X86::SEG_ALLOCA_32:
  case X86::SEG_ALLOCA_64:
    return EmitLoweredSegAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_FR128:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return EmitLoweredSelect(MI, BB);

  case X86::RDFLAGS32:
  case X86::RDFLAGS64: {
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
    // Permit reads of the FLAGS register without it being defined.
    // This intrinsic exists to read external processor state in flags, such as
    // the trap flag, interrupt flag, and direction flag, none of which are
    // modeled by the backend.
    Push->getOperand(2).setIsUndef();
    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }
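  // In other words, RDFLAGS lowers to the classic idiom (sketch):
  //   pushfq          ; FLAGS -> stack
  //   popq   %dst     ; stack -> destination register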
  case X86::WRFLAGS32:
  case X86::WRFLAGS64: {
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
    BuildMI(*BB, MI, DL, TII->get(PopF));

    MI.eraseFromParent(); // The pseudo is gone now.
    return BB;
  }

  case X86::RELEASE_FADD32mr:
  case X86::RELEASE_FADD64mr:
    return EmitLoweredAtomicFP(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI.getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI.eraseFromParent(); // The pseudo instruction is gone now.
    return BB;
  }
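  // Net effect of the control-word dance above, roughly:
  //   fnstcw <slot>          ; save the current control word
  //   movw   <slot>, %oldcw  ; remember the original value
  //   movw   $0xC7F, <slot>  ; select round-towards-zero
  //   fldcw  <slot>          ; activate it
  //   movw   %oldcw, <slot>  ; put the original image back in memory
  //   fistp  <dst>           ; truncating integer store
  //   fldcw  <slot>          ; restore the original rounding mode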
  // String/text processing lowering.
  case X86::PCMPISTRM128REG:
  case X86::VPCMPISTRM128REG:
  case X86::PCMPISTRM128MEM:
  case X86::VPCMPISTRM128MEM:
  case X86::PCMPESTRM128REG:
  case X86::VPCMPESTRM128REG:
  case X86::PCMPESTRM128MEM:
  case X86::VPCMPESTRM128MEM:
    assert(Subtarget.hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

  // String/text processing lowering.
  case X86::PCMPISTRIREG:
  case X86::VPCMPISTRIREG:
  case X86::PCMPISTRIMEM:
  case X86::VPCMPISTRIMEM:
  case X86::PCMPESTRIREG:
  case X86::VPCMPESTRIREG:
  case X86::PCMPESTRIMEM:
  case X86::VPCMPESTRIMEM:
    assert(Subtarget.hasSSE42() &&
           "Target must have SSE4.2 or AVX features enabled");
    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

  // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
  case X86::MONITORX:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

  // Cache line zero
  case X86::CLZERO:
    return emitClzero(&MI, BB, Subtarget);

  // PKU feature
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  // xbegin
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

  case X86::VAARG_64:
    return EmitVAARG64WithCustomInserter(MI, BB);

  case X86::EH_SjLj_SetJmp32:
  case X86::EH_SjLj_SetJmp64:
    return emitEHSjLjSetJmp(MI, BB);

  case X86::EH_SjLj_LongJmp32:
  case X86::EH_SjLj_LongJmp64:
    return emitEHSjLjLongJmp(MI, BB);

  case X86::Int_eh_sjlj_setup_dispatch:
    return EmitSjLjDispatchBlock(MI, BB);

  case TargetOpcode::STATEPOINT:
    // As an implementation detail, STATEPOINT shares the STACKMAP format at
    // this point in the process. We diverge later.
    return emitPatchPoint(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);

  case TargetOpcode::PATCHABLE_EVENT_CALL:
    // Do nothing here, handle in xray instrumentation pass.
    return BB;

  case X86::LCMPXCHG8B: {
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    // In addition to the four E[ABCD] registers implied by its encoding,
    // CMPXCHG8B requires a memory operand. If the target is i686 and the
    // current function needs a base pointer - which is ESI for i686 - the
    // register allocator would not be able to allocate registers for an
    // address of the form X(%reg, %reg, Y): there would never be enough
    // unreserved registers during regalloc (without the base pointer the
    // only option would be X(%edi, %esi, Y)). We give the register allocator
    // a hand by precomputing the address in a new vreg using LEA.

    // If it is not i686 or there is no base pointer - nothing to do here.
    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
      return BB;

    // Even though this code does not necessarily need the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, something
    // has changed in the compiler's base pointer handling, which most
    // probably has to be addressed here as well.
    assert(TRI->getBaseRegister() == X86::ESI &&
           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
           "base pointer in mind");

    MachineRegisterInfo &MRI = MF->getRegInfo();
    MVT SPTy = getPointerTy(MF->getDataLayout());
    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

    X86AddressMode AM = getAddressFromInstr(&MI, 0);
    // Regalloc does not need any help when the memory operand of CMPXCHG8B
    // does not use the index register.
    if (AM.IndexReg == X86::NoRegister)
      return BB;

    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
    // four operand definitions that are E[ABCD] registers. We skip them and
    // then insert the LEA.
    MachineBasicBlock::iterator MBBI(MI);
    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
      --MBBI;
    addFullAddress(
        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

    return BB;
  }
  case X86::LCMPXCHG16B:
    return BB;
  case X86::LCMPXCHG8B_SAVE_EBX:
  case X86::LCMPXCHG16B_SAVE_RBX: {
    unsigned BasePtr =
        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
    if (!BB->isLiveIn(BasePtr))
      BB->addLiveIn(BasePtr);
    return BB;
  }
  }
}
//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  Known.resetAll();
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    LLVM_FALLTHROUGH;
  case X86ISD::SETCC:
    Known.Zero.setBitsFrom(1);
    break;
  case X86ISD::MOVMSK: {
    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
    Known.Zero.setBitsFrom(NumLoBits);
    break;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
        Known.setAllZero();
        break;
      }

      DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
      unsigned ShAmt = ShiftImm->getZExtValue();
      if (Opc == X86ISD::VSHLI) {
        Known.Zero <<= ShAmt;
        Known.One <<= ShAmt;
        // Low bits are known zero.
        Known.Zero.setLowBits(ShAmt);
      } else {
        Known.Zero.lshrInPlace(ShAmt);
        Known.One.lshrInPlace(ShAmt);
        // High bits are known zero.
        Known.Zero.setHighBits(ShAmt);
      }
    }
    break;
  }
  case X86ISD::VZEXT: {
    SDValue N0 = Op.getOperand(0);
    unsigned NumElts = VT.getVectorNumElements();

    EVT SrcVT = N0.getValueType();
    unsigned InNumElts = SrcVT.getVectorNumElements();
    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
    assert(InNumElts >= NumElts && "Illegal VZEXT input");

    Known = KnownBits(InBitWidth);
    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
    Known = Known.zext(BitWidth);
    Known.Zero.setBitsFrom(InBitWidth);
    break;
  }
  }
}
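// Worked example for the shift cases above: for X86ISD::VSHLI x, 4 on i32
// elements, x's known bits are shifted left by four and Known.Zero
// additionally gets its low four bits set, since a left shift always shifts
// in zeros.
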
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;

  case X86ISD::VSEXT: {
    SDValue Src = Op.getOperand(0);
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }

  case X86ISD::VSRAI: {
    SDValue Src = Op.getOperand(0);
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

  case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;
  }

  // Fallback case.
  return 1;
}

/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
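// Example: N = X86ISD::Wrapper(GlobalAddress(@g, +8)) yields GA = @g and
// Offset = 8; any other node shape defers to the generic TargetLowering
// implementation.
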
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                    bool AllowFloatDomain, bool AllowIntDomain,
                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
    unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
        SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
        if (SrcVT != MaskVT)
          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
                                  : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
        return true;
      }
    }
  }

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
      isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  if (MaskVT.is256BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

  if (MaskVT.is512BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX512() &&
           "AVX512 required for 512-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

  // Attempt to match against broadcast-from-vector.
  if (Subtarget.hasAVX2()) {
    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
      SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
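// Example: a v2f64 mask of {0, 0} (splat the low element) is matched by the
// SSE3 block above to X86ISD::MOVDDUP with SrcVT = DstVT = v2f64.
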
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                           bool AllowFloatDomain,
                                           bool AllowIntDomain,
                                           const X86Subtarget &Subtarget,
                                           unsigned &Shuffle, MVT &ShuffleVT,
                                           unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();

  bool ContainsZeros = false;
  APInt Zeroable(NumMaskElts, false);
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (isUndefOrZero(M))
      Zeroable.setBit(i);
    ContainsZeros |= (M == SM_SentinelZero);
  }

  // Attempt to match against byte/bit shifts.
  // FIXME: Add 512-bit support.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
                                             MaskVT.getScalarSizeInBits(), Mask,
                                             0, Zeroable, Subtarget);
    if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  // Ensure we don't contain any zero elements.
  if (ContainsZeros)
    return false;

  assert(llvm::all_of(Mask, [&](int M) {
           return SM_SentinelUndef <= M && M < (int)NumMaskElts;
         }) && "Expected unary shuffle");

  unsigned InputSizeInBits = MaskVT.getSizeInBits();
  unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

  // Handle PSHUFLW/PSHUFHW repeated patterns.
  if (MaskScalarSizeInBits == 16) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      ArrayRef<int> LoMask(Mask.data() + 0, 4);
      ArrayRef<int> HiMask(Mask.data() + 4, 4);

      // PSHUFLW: permute lower 4 elements only.
      if (isUndefOrInRange(LoMask, 0, 4) &&
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
        Shuffle = X86ISD::PSHUFLW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
      if (isUndefOrInRange(HiMask, 4, 8) &&
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
        // Offset the HiMask so that we can create the shuffle immediate.
        int OffsetHiMask[4];
        for (int i = 0; i != 4; ++i)
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

        Shuffle = X86ISD::PSHUFHW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }

    return false;
  }

  // We only support permutation of 32/64 bit elements after this.
  if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
    return false;

  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
    return false;

  // Pre-AVX2 we must use float shuffles on 256-bit vectors.
  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
    AllowFloatDomain = true;
    AllowIntDomain = false;
  }

  // Check for lane crossing permutes.
  if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
    // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
    if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
      Shuffle = X86ISD::VPERMI;
      ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
      PermuteImm = getV4X86ShuffleImm(Mask);
      return true;
    }
    if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
      SmallVector<int, 4> RepeatedMask;
      if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
        Shuffle = X86ISD::VPERMI;
        ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
        PermuteImm = getV4X86ShuffleImm(RepeatedMask);
        return true;
      }
    }
    return false;
  }

  // VPERMILPD can permute with a non-repeating shuffle.
  if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
    Shuffle = X86ISD::VPERMILPI;
    ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
    PermuteImm = 0;
    for (int i = 0, e = Mask.size(); i != e; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef)
        continue;
      assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
      PermuteImm |= (M & 1) << i;
    }
    return true;
  }

  // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
  SmallVector<int, 4> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
    return false;

  // Narrow the repeated mask for 32-bit element permutes.
  SmallVector<int, 4> WordMask = RepeatedMask;
  if (MaskScalarSizeInBits == 64)
    scaleShuffleMask(2, RepeatedMask, WordMask);

  Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
  ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
  ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
  PermuteImm = getV4X86ShuffleImm(WordMask);
  return true;
}
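// Example: the v8i16 mask {2, 1, 0, 3, 4, 5, 6, 7} leaves the upper half in
// place, so it matches PSHUFLW with PermuteImm = getV4X86ShuffleImm({2,1,0,3})
// = 0b11000110 = 0xC6.
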
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     bool AllowFloatDomain, bool AllowIntDomain,
                                     SDValue &V1, SDValue &V2, SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &ShuffleVT,
                                     bool IsUnary) {
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVLHPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVHLPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      ShuffleVT = MaskVT;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      ShuffleVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                    DAG, Subtarget)) {
      ShuffleVT = MaskVT;
      if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
        ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
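// Example: a v2f64 mask of {0, 3} takes the low element from V1 and the high
// element from V2; after the std::swap above this is exactly MOVSD, which
// takes its low element from the second operand and keeps the high element of
// the first.
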
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                            bool AllowFloatDomain,
                                            bool AllowIntDomain,
                                            SDValue &V1, SDValue &V2, SDLoc &DL,
                                            SelectionDAG &DAG,
                                            const X86Subtarget &Subtarget,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  // Attempt to match against PALIGNR byte rotate.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (0 < ByteRotation) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false, ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated.
        SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
            if (RepeatedMask[i] >= 8)
              PermuteImm |= 1 << i;
          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
          Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
        // Determine a type compatible with X86ISD::BLENDI.
        ShuffleVT = MaskVT;
        if (Subtarget.hasAVX2()) {
          if (ShuffleVT == MVT::v4i64)
            ShuffleVT = MVT::v8i32;
          else if (ShuffleVT == MVT::v2i64)
            ShuffleVT = MVT::v4i32;
        } else {
          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
            ShuffleVT = MVT::v8i16;
          else if (ShuffleVT == MVT::v4i64)
            ShuffleVT = MVT::v4f64;
          else if (ShuffleVT == MVT::v8i32)
            ShuffleVT = MVT::v8f32;
        }

        if (!ShuffleVT.isFloatingPoint()) {
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
          BlendMask =
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
        }

        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector()) {
    APInt Zeroable(4, 0);
    for (unsigned i = 0; i != NumMaskElts; ++i)
      if (isUndefOrZero(Mask[i]))
        Zeroable.setBit(i);

    if (Zeroable.getBoolValue() &&
        matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
      Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }

  // Attempt to combine to SHUFPD.
  if (AllowFloatDomain && EltSizeInBits == 64 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  if (AllowFloatDomain && EltSizeInBits == 32 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    SmallVector<int, 4> RepeatedMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if it's just
      // referencing one of the vectors, is zeroable or entirely undef.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
        int M0 = RepeatedMask[Offset];
        int M1 = RepeatedMask[Offset + 1];

        if (isUndefInRange(RepeatedMask, Offset, 2)) {
          return DAG.getUNDEF(MaskVT);
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
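// Example: the v4f32 mask {0, 1, 4, 5} repeats per 128-bit lane; MatchHalf
// resolves the low half to V1 and the high half to V2, so this becomes SHUFPS
// with ShufMask = {0, 1, 0, 1}, i.e. PermuteImm = 0x44.
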
27255 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27258 /// This is the leaf of the recursive combine below. When we have found some
27259 /// chain of single-use x86 shuffle instructions and accumulated the combined
27260 /// shuffle mask represented by them, this will try to pattern match that mask
27261 /// into either a single instruction if there is a special purpose instruction
27262 /// for this operation, or into a PSHUFB instruction which is a fully general
27263 /// instruction but should only be used to replace chains over a certain depth.
27264 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27265 ArrayRef<int> BaseMask, int Depth,
27266 bool HasVariableMask, SelectionDAG &DAG,
27267 TargetLowering::DAGCombinerInfo &DCI,
27268 const X86Subtarget &Subtarget) {
27269 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27270 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27271 "Unexpected number of shuffle inputs!");
27273 // Find the inputs that enter the chain. Note that multiple uses are OK
27274 // here, we're not going to remove the operands we find.
27275 bool UnaryShuffle = (Inputs.size() == 1);
27276 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27277 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27278 : peekThroughBitcasts(Inputs[1]));
27280 MVT VT1 = V1.getSimpleValueType();
27281 MVT VT2 = V2.getSimpleValueType();
27282 MVT RootVT = Root.getSimpleValueType();
27283 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27284 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27285 "Vector size mismatch");
27290 unsigned NumBaseMaskElts = BaseMask.size();
27291 if (NumBaseMaskElts == 1) {
27292 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27293 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27298 unsigned RootSizeInBits = RootVT.getSizeInBits();
27299 unsigned NumRootElts = RootVT.getVectorNumElements();
27300 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27301 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27302 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27304 // Don't combine if we are a AVX512/EVEX target and the mask element size
27305 // is different from the root element size - this would prevent writemasks
27306 // from being reused.
27307 // TODO - this currently prevents all lane shuffles from occurring.
27308 // TODO - check for writemasks usage instead of always preventing combining.
27309 // TODO - attempt to narrow Mask back to writemask size.
27310 bool IsEVEXShuffle =
27311 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27312 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27315 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27317 // Handle 128-bit lane shuffles of 256-bit vectors.
27318 // TODO - this should support binary shuffles.
27319 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27320 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27321 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27322 return false; // Nothing to do!
27323 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27324 unsigned PermMask = 0;
27325 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27326 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27328 Res = DAG.getBitcast(ShuffleVT, V1);
27329 DCI.AddToWorklist(Res.getNode());
27330 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27331 DAG.getUNDEF(ShuffleVT),
27332 DAG.getConstant(PermMask, DL, MVT::i8));
27333 DCI.AddToWorklist(Res.getNode());
27334 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27339 // For masks that have been widened to 128-bit elements or more,
27340 // narrow back down to 64-bit elements.
27341 SmallVector<int, 64> Mask;
27342 if (BaseMaskEltSizeInBits > 64) {
27343 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27344 int MaskScale = BaseMaskEltSizeInBits / 64;
27345 scaleShuffleMask(MaskScale, BaseMask, Mask);
27347 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27350 unsigned NumMaskElts = Mask.size();
27351 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27353 // Determine the effective mask value type.
27354 FloatDomain &= (32 <= MaskEltSizeInBits);
27355 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27356 : MVT::getIntegerVT(MaskEltSizeInBits);
27357 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27359 // Only allow legal mask types.
27360 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27363 // Attempt to match the mask against known shuffle patterns.
27364 MVT ShuffleSrcVT, ShuffleVT;
27365 unsigned Shuffle, PermuteImm;
27367 // Which shuffle domains are permitted?
27368 // Permit domain crossing at higher combine depths.
27369 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27370 bool AllowIntDomain = !FloatDomain || (Depth > 3);
27372 if (UnaryShuffle) {
27373 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27374 // directly if we don't shuffle the lower element and we shuffle the upper
27375 // (zero) elements within themselves.
27376 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27377 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27378 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27379 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27380 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27381 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27382 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27388 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27389 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27391 if (Depth == 1 && Root.getOpcode() == Shuffle)
27392 return false; // Nothing to do!
27393 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27394 return false; // AVX512 Writemask clash.
27395 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27396 DCI.AddToWorklist(Res.getNode());
27397 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27398 DCI.AddToWorklist(Res.getNode());
27399 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27404 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27405 AllowIntDomain, Subtarget, Shuffle,
27406 ShuffleVT, PermuteImm)) {
27407 if (Depth == 1 && Root.getOpcode() == Shuffle)
27408 return false; // Nothing to do!
27409 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27410 return false; // AVX512 Writemask clash.
27411 Res = DAG.getBitcast(ShuffleVT, V1);
27412 DCI.AddToWorklist(Res.getNode());
27413 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27414 DAG.getConstant(PermuteImm, DL, MVT::i8));
27415 DCI.AddToWorklist(Res.getNode());
27416 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27422 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27423 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27425 if (Depth == 1 && Root.getOpcode() == Shuffle)
27426 return false; // Nothing to do!
27427 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27428 return false; // AVX512 Writemask clash.
27429 V1 = DAG.getBitcast(ShuffleVT, V1);
27430 DCI.AddToWorklist(V1.getNode());
27431 V2 = DAG.getBitcast(ShuffleVT, V2);
27432 DCI.AddToWorklist(V2.getNode());
27433 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27434 DCI.AddToWorklist(Res.getNode());
27435 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27440 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
27441 AllowIntDomain, V1, V2, DL, DAG,
27442 Subtarget, Shuffle, ShuffleVT,
27444 if (Depth == 1 && Root.getOpcode() == Shuffle)
27445 return false; // Nothing to do!
27446 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27447 return false; // AVX512 Writemask clash.
27448 V1 = DAG.getBitcast(ShuffleVT, V1);
27449 DCI.AddToWorklist(V1.getNode());
27450 V2 = DAG.getBitcast(ShuffleVT, V2);
27451 DCI.AddToWorklist(V2.getNode());
27452 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27453 DAG.getConstant(PermuteImm, DL, MVT::i8));
27454 DCI.AddToWorklist(Res.getNode());
27455 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27460 // Don't try to re-form single instruction chains under any circumstances now
27461 // that we've done encoding canonicalization for them.
27465 bool MaskContainsZeros =
27466 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27468 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27469 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27470 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27471 ((Subtarget.hasAVX2() &&
27472 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27473 (Subtarget.hasAVX512() &&
27474 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27475 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27476 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27477 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27478 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27479 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27480 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27481 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27482 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27483 DCI.AddToWorklist(VPermMask.getNode());
27484 Res = DAG.getBitcast(MaskVT, V1);
27485 DCI.AddToWorklist(Res.getNode());
27486 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27487 DCI.AddToWorklist(Res.getNode());
27488 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27493 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27494 // vector as the second source.
27495 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27496 ((Subtarget.hasAVX512() &&
27497 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27498 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27499 (Subtarget.hasVLX() &&
27500 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27501 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27502 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27503 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27504 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27505 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27506 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27507 for (unsigned i = 0; i != NumMaskElts; ++i)
27508 if (Mask[i] == SM_SentinelZero)
27509 Mask[i] = NumMaskElts + i;
27511 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27512 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27513 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27514 DCI.AddToWorklist(VPermMask.getNode());
27515 Res = DAG.getBitcast(MaskVT, V1);
27516 DCI.AddToWorklist(Res.getNode());
27517 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27518 DCI.AddToWorklist(Zero.getNode());
27519 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27520 DCI.AddToWorklist(Res.getNode());
27521 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27526 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27527 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27528 ((Subtarget.hasAVX512() &&
27529 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27530 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27531 (Subtarget.hasVLX() &&
27532 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27533 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27534 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27535 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27536 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27537 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27538 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27539 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27540 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27541 DCI.AddToWorklist(VPermMask.getNode());
27542 V1 = DAG.getBitcast(MaskVT, V1);
27543 DCI.AddToWorklist(V1.getNode());
27544 V2 = DAG.getBitcast(MaskVT, V2);
27545 DCI.AddToWorklist(V2.getNode());
27546 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27547 DCI.AddToWorklist(Res.getNode());
27548 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27555 // See if we can combine a single input shuffle with zeros to a bit-mask,
27556 // which is much simpler than any shuffle.
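// For example (an illustrative sketch, not tied to a specific test): a unary
// v4i32 shuffle mask <0, zero, 2, zero> only needs to clear lanes 1 and 3, so
// it can be lowered as an AND with the constant <-1, 0, -1, 0>.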
27557 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27558 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27559 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27560 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27561 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27562 APInt UndefElts(NumMaskElts, 0);
27563 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27564 for (unsigned i = 0; i != NumMaskElts; ++i) {
27566 if (M == SM_SentinelUndef) {
27567 UndefElts.setBit(i);
27570 if (M == SM_SentinelZero)
27572 EltBits[i] = AllOnes;
27574 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27575 DCI.AddToWorklist(BitMask.getNode());
27576 Res = DAG.getBitcast(MaskVT, V1);
27577 DCI.AddToWorklist(Res.getNode());
27578 unsigned AndOpcode =
27579 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27580 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27581 DCI.AddToWorklist(Res.getNode());
27582 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27587 // If we have a single input shuffle with different shuffle patterns in the
27588 // 128-bit lanes, lower it to VPERMILPS with a variable mask.
27589 // TODO: Combine other mask types at higher depths.
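// For example, a v8f32 mask <1,0,3,2,6,7,4,5> has per-lane patterns <1,0,3,2>
// and <2,3,0,1>, so no single immediate form fits; the in-lane indices
// (M % 4) give the variable VPERMILPS mask <1,0,3,2,2,3,0,1>.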
27590 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27591 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27592 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27593 SmallVector<SDValue, 16> VPermIdx;
27594 for (int M : Mask) {
27596 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27597 VPermIdx.push_back(Idx);
27599 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27600 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27601 DCI.AddToWorklist(VPermMask.getNode());
27602 Res = DAG.getBitcast(MaskVT, V1);
27603 DCI.AddToWorklist(Res.getNode());
27604 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27605 DCI.AddToWorklist(Res.getNode());
27606 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27611 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27612 // to VPERMIL2PD/VPERMIL2PS.
27613 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27614 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27615 MaskVT == MVT::v8f32)) {
27616 // VPERMIL2 Operation.
27617 // Bits[3] - Match Bit.
27618 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27619 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
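// For example (illustrative), in the PS encoding an index of 6 (0b110)
// selects in-lane element 2 of the second source, while an index of 8 sets
// the match bit so the element is zeroed (subject to the M2Z immediate).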
27620 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27621 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27622 SmallVector<int, 8> VPerm2Idx;
27623 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27624 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27625 unsigned M2ZImm = 0;
27626 for (int M : Mask) {
27627 if (M == SM_SentinelUndef) {
27628 VPerm2Idx.push_back(-1);
27631 if (M == SM_SentinelZero) {
27633 VPerm2Idx.push_back(8);
27636 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27637 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27638 VPerm2Idx.push_back(Index);
27640 V1 = DAG.getBitcast(MaskVT, V1);
27641 DCI.AddToWorklist(V1.getNode());
27642 V2 = DAG.getBitcast(MaskVT, V2);
27643 DCI.AddToWorklist(V2.getNode());
27644 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27645 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27646 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27647 DAG.getConstant(M2ZImm, DL, MVT::i8));
27648 DCI.AddToWorklist(Res.getNode());
27649 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27654 // If we have 3 or more shuffle instructions or a chain involving a variable
27655 // mask, we can replace them with a single PSHUFB instruction profitably.
27656 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27657 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27658 // more aggressive.
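// For example, a unary v4i32 mask <1, u, zero, 3> widens (Ratio = 4) to the
// byte mask <4,5,6,7, u,u,u,u, 255,255,255,255, 12,13,14,15>, where 255 has
// the high bit set so PSHUFB zeroes that byte.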
27659 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27660 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27661 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27662 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27663 SmallVector<SDValue, 16> PSHUFBMask;
27664 int NumBytes = RootVT.getSizeInBits() / 8;
27665 int Ratio = NumBytes / NumMaskElts;
27666 for (int i = 0; i < NumBytes; ++i) {
27667 int M = Mask[i / Ratio];
27668 if (M == SM_SentinelUndef) {
27669 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27672 if (M == SM_SentinelZero) {
27673 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27676 M = Ratio * M + i % Ratio;
27677 assert((M / 16) == (i / 16) && "Lane crossing detected");
27678 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27680 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27681 Res = DAG.getBitcast(ByteVT, V1);
27682 DCI.AddToWorklist(Res.getNode());
27683 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27684 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27685 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27686 DCI.AddToWorklist(Res.getNode());
27687 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27692 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27693 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27694 // slower than PSHUFB on targets that support both.
27695 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27696 Subtarget.hasXOP()) {
27697 // VPPERM Mask Operation
27698 // Bits[4:0] - Byte Index (0 - 31)
27699 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
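// For example, mask byte 0x12 (18) selects byte 2 of the second source from
// the 32-byte concatenation of the inputs, while 0x80 sets the ZERO permute
// operation so the result byte is zeroed.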
27700 SmallVector<SDValue, 16> VPPERMMask;
27702 int Ratio = NumBytes / NumMaskElts;
27703 for (int i = 0; i < NumBytes; ++i) {
27704 int M = Mask[i / Ratio];
27705 if (M == SM_SentinelUndef) {
27706 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27709 if (M == SM_SentinelZero) {
27710 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27713 M = Ratio * M + i % Ratio;
27714 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27716 MVT ByteVT = MVT::v16i8;
27717 V1 = DAG.getBitcast(ByteVT, V1);
27718 DCI.AddToWorklist(V1.getNode());
27719 V2 = DAG.getBitcast(ByteVT, V2);
27720 DCI.AddToWorklist(V2.getNode());
27721 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27722 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27723 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27724 DCI.AddToWorklist(Res.getNode());
27725 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27730 // Failed to find any combines.
27734 // Attempt to constant fold all of the constant source ops.
27735 // Returns true if the entire shuffle is folded to a constant.
27736 // TODO: Extend this to merge multiple constant Ops and update the mask.
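// For example (a sketch): a unary shuffle of the constant <1,2,3,4> with the
// mask <3,2,1,0> can be replaced outright by the constant <4,3,2,1>.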
27737 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27738 ArrayRef<int> Mask, SDValue Root,
27739 bool HasVariableMask, SelectionDAG &DAG,
27740 TargetLowering::DAGCombinerInfo &DCI,
27741 const X86Subtarget &Subtarget) {
27742 MVT VT = Root.getSimpleValueType();
27744 unsigned SizeInBits = VT.getSizeInBits();
27745 unsigned NumMaskElts = Mask.size();
27746 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27747 unsigned NumOps = Ops.size();
27749 // Extract constant bits from each source op.
27750 bool OneUseConstantOp = false;
27751 SmallVector<APInt, 16> UndefEltsOps(NumOps);
27752 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
27753 for (unsigned i = 0; i != NumOps; ++i) {
27754 SDValue SrcOp = Ops[i];
27755 OneUseConstantOp |= SrcOp.hasOneUse();
27756 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27761 // Only fold if at least one of the constants is only used once or
27762 // the combined shuffle has included a variable mask shuffle; this
27763 // is to avoid constant pool bloat.
27764 if (!OneUseConstantOp && !HasVariableMask)
27767 // Shuffle the constant bits according to the mask.
27768 APInt UndefElts(NumMaskElts, 0);
27769 APInt ZeroElts(NumMaskElts, 0);
27770 APInt ConstantElts(NumMaskElts, 0);
27771 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27772 APInt::getNullValue(MaskSizeInBits));
27773 for (unsigned i = 0; i != NumMaskElts; ++i) {
27775 if (M == SM_SentinelUndef) {
27776 UndefElts.setBit(i);
27778 } else if (M == SM_SentinelZero) {
27779 ZeroElts.setBit(i);
27782 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27784 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27785 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27787 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27788 if (SrcUndefElts[SrcMaskIdx]) {
27789 UndefElts.setBit(i);
27793 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27794 APInt &Bits = SrcEltBits[SrcMaskIdx];
27796 ZeroElts.setBit(i);
27800 ConstantElts.setBit(i);
27801 ConstantBitData[i] = Bits;
27803 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
27805 // Create the constant data.
27807 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27808 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27810 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27812 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27815 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27816 DCI.AddToWorklist(CstOp.getNode());
27817 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27821 /// \brief Fully generic combining of x86 shuffle instructions.
27823 /// This should be the last combine run over the x86 shuffle instructions. Once
27824 /// they have been fully optimized, this will recursively consider all chains
27825 /// of single-use shuffle instructions, build a generic model of the cumulative
27826 /// shuffle operation, and check for simpler instructions which implement this
27827 /// operation. We use this primarily for two purposes:
27829 /// 1) Collapse generic shuffles to specialized single instructions when
27830 /// equivalent. In most cases, this is just an encoding size win, but
27831 /// sometimes we will collapse multiple generic shuffles into a single
27832 /// special-purpose shuffle.
27833 /// 2) Look for sequences of shuffle instructions with 3 or more total
27834 /// instructions, and replace them with the slightly more expensive SSSE3
27835 /// PSHUFB instruction if available. We do this as the last combining step
27836 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27837 /// a suitable short sequence of other instructions. The PSHUFB will either
27838 /// use a register or have to read from memory and so is slightly (but only
27839 /// slightly) more expensive than the other shuffle instructions.
27841 /// Because this is inherently a quadratic operation (for each shuffle in
27842 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27843 /// This should never be an issue in practice as the shuffle lowering doesn't
27844 /// produce sequences of more than 8 instructions.
27846 /// FIXME: We will currently miss some cases where the redundant shuffling
27847 /// would simplify under the threshold for PSHUFB formation because of
27848 /// combine-ordering. To fix this, we should do the redundant instruction
27849 /// combining in this recursive walk.
27850 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27851 int SrcOpIndex, SDValue Root,
27852 ArrayRef<int> RootMask,
27853 ArrayRef<const SDNode*> SrcNodes,
27854 int Depth, bool HasVariableMask,
27856 TargetLowering::DAGCombinerInfo &DCI,
27857 const X86Subtarget &Subtarget) {
27858 // Bound the depth of our recursive combine because this is ultimately
27859 // quadratic in nature.
27863 // Directly rip through bitcasts to find the underlying operand.
27864 SDValue Op = SrcOps[SrcOpIndex];
27865 Op = peekThroughOneUseBitcasts(Op);
27867 MVT VT = Op.getSimpleValueType();
27868 if (!VT.isVector())
27869 return false; // Bail if we hit a non-vector.
27871 assert(Root.getSimpleValueType().isVector() &&
27872 "Shuffles operate on vector types!");
27873 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27874 "Can only combine shuffles of the same vector register size.");
27876 // Extract target shuffle mask and resolve sentinels and inputs.
27877 SmallVector<int, 64> OpMask;
27878 SmallVector<SDValue, 2> OpInputs;
27879 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
27882 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
27883 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
27884 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
27886 // Add the inputs to the Ops list, avoiding duplicates.
27887 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
27889 int InputIdx0 = -1, InputIdx1 = -1;
27890 for (int i = 0, e = Ops.size(); i < e; ++i) {
27891 SDValue BC = peekThroughBitcasts(Ops[i]);
27892 if (Input0 && BC == peekThroughBitcasts(Input0))
27894 if (Input1 && BC == peekThroughBitcasts(Input1))
27898 if (Input0 && InputIdx0 < 0) {
27899 InputIdx0 = SrcOpIndex;
27900 Ops[SrcOpIndex] = Input0;
27902 if (Input1 && InputIdx1 < 0) {
27903 InputIdx1 = Ops.size();
27904 Ops.push_back(Input1);
27907 assert(((RootMask.size() > OpMask.size() &&
27908 RootMask.size() % OpMask.size() == 0) ||
27909 (OpMask.size() > RootMask.size() &&
27910 OpMask.size() % RootMask.size() == 0) ||
27911 OpMask.size() == RootMask.size()) &&
27912 "The smaller number of elements must divide the larger.");
27913 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27914 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27915 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27916 assert(((RootRatio == 1 && OpRatio == 1) ||
27917 (RootRatio == 1) != (OpRatio == 1)) &&
27918 "Must not have a ratio for both incoming and op masks!");
27920 SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
27922 // Merge this shuffle operation's mask into our accumulated mask. Note that
27923 // this shuffle's mask will be the first applied to the input, followed by the
27924 // root mask to get us all the way to the root value arrangement. The reason
27925 // for this order is that we are recursing up the operation chain.
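// For example, composing a root mask <2,3,0,1> over an op mask <1,0,3,2>
// yields Mask[i] = OpMask[RootMask[i]] = <3,2,1,0> (ignoring ratios and
// multiple-input offsets for this sketch).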
27926 for (int i = 0; i < MaskWidth; ++i) {
27927 int RootIdx = i / RootRatio;
27928 if (RootMask[RootIdx] < 0) {
27929 // This is a zero or undef lane, we're done.
27930 Mask[i] = RootMask[RootIdx];
27934 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27936 // Just insert the scaled root mask value if it references an input other
27937 // than the SrcOp we're currently inserting.
27938 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27939 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27940 Mask[i] = RootMaskedIdx;
27944 RootMaskedIdx %= MaskWidth;
27946 int OpIdx = RootMaskedIdx / OpRatio;
27947 if (OpMask[OpIdx] < 0) {
27948 // The incoming lanes are zero or undef, it doesn't matter which ones we
27950 Mask[i] = OpMask[OpIdx];
27954 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27955 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27956 OpMaskedIdx %= MaskWidth;
27958 if (OpMask[OpIdx] < (int)OpMask.size()) {
27959 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27960 OpMaskedIdx += InputIdx0 * MaskWidth;
27962 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27963 OpMaskedIdx += InputIdx1 * MaskWidth;
27966 Mask[i] = OpMaskedIdx;
27969 // Handle the all undef/zero cases early.
27970 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27971 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27974 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27975 // TODO - should we handle the mixed zero/undef case as well? Just returning
27976 // a zero mask will lose information on undef elements, possibly reducing
27977 // future combine possibilities.
27978 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27979 Subtarget, DAG, SDLoc(Root)));
27983 // Remove unused shuffle source ops.
27984 resolveTargetShuffleInputsAndMask(Ops, Mask);
27985 assert(!Ops.empty() && "Shuffle with no inputs detected");
27987 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27989 // Update the list of shuffle nodes that have been combined so far.
27990 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
27992 CombinedNodes.push_back(Op.getNode());
27994 // See if we can recurse into each shuffle source op (if it's a target
27995 // shuffle). The source op should only be combined if it either has a
27996 // single use (i.e. current Op) or all its users have already been combined.
27997 for (int i = 0, e = Ops.size(); i < e; ++i)
27998 if (Ops[i].getNode()->hasOneUse() ||
27999 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28000 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28001 Depth + 1, HasVariableMask, DAG, DCI,
28005 // Attempt to constant fold all of the constant source ops.
28006 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28010 // We can only combine unary and binary shuffle mask cases.
28011 if (Ops.size() > 2)
28014 // Minor canonicalization of the accumulated shuffle mask to make it easier
28015 // to match below. All this does is detect masks with sequential pairs of
28016 // elements, and shrink them to the half-width mask. It does this in a loop
28017 // so it will reduce the size of the mask to the minimal width mask which
28018 // performs an equivalent shuffle.
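// For example, the v4i32 mask <2,3,0,1> pairs up into the equivalent v2i64
// mask <1,0>, which is more likely to match a 64-bit shuffle pattern.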
28019 SmallVector<int, 64> WidenedMask;
28020 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28021 Mask = std::move(WidenedMask);
28024 // Canonicalization of binary shuffle masks to improve pattern matching by
28025 // commuting the inputs.
28026 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28027 ShuffleVectorSDNode::commuteMask(Mask);
28028 std::swap(Ops[0], Ops[1]);
28031 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28035 /// \brief Get the PSHUF-style mask from PSHUF node.
28037 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28038 /// PSHUF-style masks that can be reused with such instructions.
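/// For example, a PSHUFD with immediate 0xB1 (0b10'11'00'01) corresponds to
/// the mask <1,0,3,2>.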
28039 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28040 MVT VT = N.getSimpleValueType();
28041 SmallVector<int, 4> Mask;
28042 SmallVector<SDValue, 2> Ops;
28045 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28049 // If we have more than 128-bits, only the low 128-bits of shuffle mask
28050 // matter. Check that the upper masks are repeats and remove them.
28051 if (VT.getSizeInBits() > 128) {
28052 int LaneElts = 128 / VT.getScalarSizeInBits();
28054 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28055 for (int j = 0; j < LaneElts; ++j)
28056 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28057 "Mask doesn't repeat in high 128-bit lanes!");
28059 Mask.resize(LaneElts);
28062 switch (N.getOpcode()) {
28063 case X86ISD::PSHUFD:
28065 case X86ISD::PSHUFLW:
28068 case X86ISD::PSHUFHW:
28069 Mask.erase(Mask.begin(), Mask.begin() + 4);
28070 for (int &M : Mask)
28074 llvm_unreachable("No valid shuffle instruction found!");
28078 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28080 /// We walk up the chain and look for a combinable shuffle, skipping over
28081 /// shuffles that we could hoist this shuffle's transformation past without
28082 /// altering anything.
28084 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28085 SelectionDAG &DAG) {
28086 assert(N.getOpcode() == X86ISD::PSHUFD &&
28087 "Called with something other than an x86 128-bit half shuffle!");
28090 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28091 // of the shuffles in the chain so that we can form a fresh chain to replace
28093 SmallVector<SDValue, 8> Chain;
28094 SDValue V = N.getOperand(0);
28095 for (; V.hasOneUse(); V = V.getOperand(0)) {
28096 switch (V.getOpcode()) {
28098 return SDValue(); // Nothing combined!
28101 // Skip bitcasts as we always know the type for the target specific
28105 case X86ISD::PSHUFD:
28106 // Found another dword shuffle.
28109 case X86ISD::PSHUFLW:
28110 // Check that the low words (being shuffled) are the identity in the
28111 // dword shuffle, and the high words are self-contained.
28112 if (Mask[0] != 0 || Mask[1] != 1 ||
28113 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28116 Chain.push_back(V);
28119 case X86ISD::PSHUFHW:
28120 // Check that the high words (being shuffled) are the identity in the
28121 // dword shuffle, and the low words are self-contained.
28122 if (Mask[2] != 2 || Mask[3] != 3 ||
28123 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28126 Chain.push_back(V);
28129 case X86ISD::UNPCKL:
28130 case X86ISD::UNPCKH:
28131 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28132 // shuffle into a preceding word shuffle.
28133 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28134 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28137 // Search for a half-shuffle which we can combine with.
28138 unsigned CombineOp =
28139 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28140 if (V.getOperand(0) != V.getOperand(1) ||
28141 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28143 Chain.push_back(V);
28144 V = V.getOperand(0);
28146 switch (V.getOpcode()) {
28148 return SDValue(); // Nothing to combine.
28150 case X86ISD::PSHUFLW:
28151 case X86ISD::PSHUFHW:
28152 if (V.getOpcode() == CombineOp)
28155 Chain.push_back(V);
28159 V = V.getOperand(0);
28163 } while (V.hasOneUse());
28166 // Break out of the loop if we break out of the switch.
28170 if (!V.hasOneUse())
28171 // We fell out of the loop without finding a viable combining instruction.
28174 // Merge this node's mask and our incoming mask.
28175 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28176 for (int &M : Mask)
28178 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28179 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28181 // Rebuild the chain around this new shuffle.
28182 while (!Chain.empty()) {
28183 SDValue W = Chain.pop_back_val();
28185 if (V.getValueType() != W.getOperand(0).getValueType())
28186 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28188 switch (W.getOpcode()) {
28190 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28192 case X86ISD::UNPCKL:
28193 case X86ISD::UNPCKH:
28194 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28197 case X86ISD::PSHUFD:
28198 case X86ISD::PSHUFLW:
28199 case X86ISD::PSHUFHW:
28200 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28204 if (V.getValueType() != N.getValueType())
28205 V = DAG.getBitcast(N.getValueType(), V);
28207 // Return the new chain to replace N.
28211 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28214 /// We walk up the chain, skipping shuffles of the other half and looking
28215 /// through shuffles which switch halves trying to find a shuffle of the same
28216 /// pair of dwords.
28217 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28219 TargetLowering::DAGCombinerInfo &DCI) {
28221 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28222 "Called with something other than an x86 128-bit half shuffle!");
28224 unsigned CombineOpcode = N.getOpcode();
28226 // Walk up a single-use chain looking for a combinable shuffle.
28227 SDValue V = N.getOperand(0);
28228 for (; V.hasOneUse(); V = V.getOperand(0)) {
28229 switch (V.getOpcode()) {
28231 return false; // Nothing combined!
28234 // Skip bitcasts as we always know the type for the target specific
28238 case X86ISD::PSHUFLW:
28239 case X86ISD::PSHUFHW:
28240 if (V.getOpcode() == CombineOpcode)
28243 // Other-half shuffles are no-ops.
28246 // Break out of the loop if we break out of the switch.
28250 if (!V.hasOneUse())
28251 // We fell out of the loop without finding a viable combining instruction.
28254 // Combine away the bottom node as its shuffle will be accumulated into
28255 // a preceding shuffle.
28256 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28258 // Record the old value.
28261 // Merge this node's mask and our incoming mask (adjusted to account for all
28262 // the pshufd instructions encountered).
28263 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28264 for (int &M : Mask)
28266 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28267 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28269 // Check that the shuffles didn't cancel each other out. If not, we need to
28270 // combine to the new one.
28272 // Replace the combinable shuffle with the combined one, updating all users
28273 // so that we re-evaluate the chain here.
28274 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28279 /// \brief Try to combine x86 target specific shuffles.
28280 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28281 TargetLowering::DAGCombinerInfo &DCI,
28282 const X86Subtarget &Subtarget) {
28284 MVT VT = N.getSimpleValueType();
28285 SmallVector<int, 4> Mask;
28287 unsigned Opcode = N.getOpcode();
28289 case X86ISD::PSHUFD:
28290 case X86ISD::PSHUFLW:
28291 case X86ISD::PSHUFHW:
28292 Mask = getPSHUFShuffleMask(N);
28293 assert(Mask.size() == 4);
28295 case X86ISD::UNPCKL: {
28296 auto Op0 = N.getOperand(0);
28297 auto Op1 = N.getOperand(1);
28298 unsigned Opcode0 = Op0.getOpcode();
28299 unsigned Opcode1 = Op1.getOpcode();
28301 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28302 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28303 // TODO: Add other horizontal operations as required.
28304 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28305 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28307 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28308 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28309 // moves upper half elements into the lower half part. For example:
28311 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28313 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28315 // will be combined to:
28317 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28319 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28320 // happen due to advanced instructions.
28321 if (!VT.is128BitVector())
28324 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28325 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28327 unsigned NumElts = VT.getVectorNumElements();
28328 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28329 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28332 auto ShufOp = Op1.getOperand(0);
28333 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28334 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28338 case X86ISD::BLENDI: {
28339 SDValue V0 = N->getOperand(0);
28340 SDValue V1 = N->getOperand(1);
28341 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28342 "Unexpected input vector types");
28344 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28345 // operands and changing the mask to 1. This saves us a bunch of
28346 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28347 // x86InstrInfo knows how to commute this back after instruction selection
28348 // if it would help register allocation.
28350 // TODO: If optimizing for size or a processor that doesn't suffer from
28351 // partial register update stalls, this should be transformed into a MOVSD
28352 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
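// For example, a v2f64 blend with mask 2 takes element 0 from the first
// operand and element 1 from the second; swapping the operands and using
// mask 1 selects exactly the same elements.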
28354 if (VT == MVT::v2f64)
28355 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28356 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28357 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28358 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28363 case X86ISD::MOVSD:
28364 case X86ISD::MOVSS: {
28365 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28366 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28367 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28368 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28369 if (isZero0 && isZero1)
28372 // We often lower to MOVSD/MOVSS from integer as well as native float
28373 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28374 // easier to combine shuffles later on. We've already accounted for the
28375 // domain switching cost when we decided to lower with it.
28376 bool isFloat = VT.isFloatingPoint();
28377 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28378 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28379 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28380 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28381 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28382 V0 = DAG.getBitcast(NewVT, V0);
28383 V1 = DAG.getBitcast(NewVT, V1);
28384 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28389 case X86ISD::INSERTPS: {
28390 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28391 SDValue Op0 = N.getOperand(0);
28392 SDValue Op1 = N.getOperand(1);
28393 SDValue Op2 = N.getOperand(2);
28394 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28395 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28396 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28397 unsigned ZeroMask = InsertPSMask & 0xF;
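// For example, an InsertPSMask of 0x9C decodes to SrcIdx = 2, DstIdx = 1 and
// ZeroMask = 0b1100 (zero output elements 2 and 3).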
28399 // If we zero out all elements from Op0 then we don't need to reference it.
28400 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28401 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28402 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28404 // If we zero out the element from Op1 then we don't need to reference it.
28405 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28406 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28407 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28409 // Attempt to merge insertps Op1 with an inner target shuffle node.
28410 SmallVector<int, 8> TargetMask1;
28411 SmallVector<SDValue, 2> Ops1;
28412 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28413 int M = TargetMask1[SrcIdx];
28414 if (isUndefOrZero(M)) {
28415 // Zero/UNDEF insertion - zero out element and remove dependency.
28416 InsertPSMask |= (1u << DstIdx);
28417 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28418 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28420 // Update insertps mask srcidx and reference the source input directly.
28421 assert(0 <= M && M < 8 && "Shuffle index out of range");
28422 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28423 Op1 = Ops1[M < 4 ? 0 : 1];
28424 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28425 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28428 // Attempt to merge insertps Op0 with an inner target shuffle node.
28429 SmallVector<int, 8> TargetMask0;
28430 SmallVector<SDValue, 2> Ops0;
28431 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28434 bool Updated = false;
28435 bool UseInput00 = false;
28436 bool UseInput01 = false;
28437 for (int i = 0; i != 4; ++i) {
28438 int M = TargetMask0[i];
28439 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28440 // No change if element is already zero or the inserted element.
28442 } else if (isUndefOrZero(M)) {
28443 // If the target mask is undef/zero then we must zero the element.
28444 InsertPSMask |= (1u << i);
28449 // The input vector element must be in place (index i in either input).
28450 if (M != i && M != (i + 4))
28453 // Determine which inputs of the target shuffle we're using.
28454 UseInput00 |= (0 <= M && M < 4);
28455 UseInput01 |= (4 <= M);
28458 // If we're not using both inputs of the target shuffle then use the
28459 // referenced input directly.
28460 if (UseInput00 && !UseInput01) {
28463 } else if (!UseInput00 && UseInput01) {
28469 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28470 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28478 // Nuke no-op shuffles that show up after combining.
28479 if (isNoopShuffleMask(Mask))
28480 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28482 // Look for simplifications involving one or two shuffle instructions.
28483 SDValue V = N.getOperand(0);
28484 switch (N.getOpcode()) {
28487 case X86ISD::PSHUFLW:
28488 case X86ISD::PSHUFHW:
28489 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28491 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28492 return SDValue(); // We combined away this shuffle, so we're done.
28494 // See if this reduces to a PSHUFD which is no more expensive and can
28495 // combine with more operations. Note that it has to at least flip the
28496 // dwords as otherwise it would have been removed as a no-op.
28497 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28498 int DMask[] = {0, 1, 2, 3};
28499 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28500 DMask[DOffset + 0] = DOffset + 1;
28501 DMask[DOffset + 1] = DOffset + 0;
28502 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28503 V = DAG.getBitcast(DVT, V);
28504 DCI.AddToWorklist(V.getNode());
28505 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28506 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28507 DCI.AddToWorklist(V.getNode());
28508 return DAG.getBitcast(VT, V);
28511 // Look for shuffle patterns which can be implemented as a single unpack.
28512 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28513 // only works when we have a PSHUFD followed by two half-shuffles.
28514 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28515 (V.getOpcode() == X86ISD::PSHUFLW ||
28516 V.getOpcode() == X86ISD::PSHUFHW) &&
28517 V.getOpcode() != N.getOpcode() &&
28519 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28520 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28521 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28522 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28523 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28524 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28526 for (int i = 0; i < 4; ++i) {
28527 WordMask[i + NOffset] = Mask[i] + NOffset;
28528 WordMask[i + VOffset] = VMask[i] + VOffset;
28530 // Map the word mask through the DWord mask.
28532 for (int i = 0; i < 8; ++i)
28533 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28534 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28535 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28536 // We can replace all three shuffles with an unpack.
28537 V = DAG.getBitcast(VT, D.getOperand(0));
28538 DCI.AddToWorklist(V.getNode());
28539 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28548 case X86ISD::PSHUFD:
28549 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28558 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28559 /// operation. If true is returned then the operands of ADDSUB operation
28560 /// are written to the parameters \p Opnd0 and \p Opnd1.
28562 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28563 /// so it is easier to generically match. We also insert dummy vector shuffle
28564 /// nodes for the operands which explicitly discard the lanes which are unused
28565 /// by this operation, so that the fact that those lanes are unused can flow
28566 /// through the rest of the combiner.
28567 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28568 SDValue &Opnd0, SDValue &Opnd1) {
28570 EVT VT = N->getValueType(0);
28571 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28572 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28573 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28576 // We only handle target-independent shuffles.
28577 // FIXME: It would be easy and harmless to use the target shuffle mask
28578 // extraction tool to support more.
28579 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28582 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28583 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28585 SDValue V1 = N->getOperand(0);
28586 SDValue V2 = N->getOperand(1);
28588 // We require the first shuffle operand to be the FSUB node, and the second to
28589 // be the FADD node.
28590 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28591 ShuffleVectorSDNode::commuteMask(Mask);
28593 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28596 // If there are other uses of these operations we can't fold them.
28597 if (!V1->hasOneUse() || !V2->hasOneUse())
28600 // Ensure that both operations have the same operands. Note that we can
28601 // commute the FADD operands.
28602 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28603 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28604 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28607 // We're looking for blends between FADD and FSUB nodes. We insist on these
28608 // nodes being lined up in a specific expected pattern.
28609 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28610 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28611 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28612 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28613 8, 25, 10, 27, 12, 29, 14, 31})))
28621 /// \brief Try to combine a shuffle into a target-specific add-sub or
28622 /// mul-add-sub node.
28623 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28624 const X86Subtarget &Subtarget,
28625 SelectionDAG &DAG) {
28626 SDValue Opnd0, Opnd1;
28627 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28630 EVT VT = N->getValueType(0);
28633 // Try to generate X86ISD::FMADDSUB node here.
28635 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28636 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28638 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28639 // the ADDSUB idiom has been successfully recognized. There are no known
28640 // X86 targets with 512-bit ADDSUB instructions!
28641 if (VT.is512BitVector())
28644 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28647 // We are looking for a shuffle where both sources are concatenated with undef
28648 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28649 // if we can express this as a single-source shuffle, that's preferable.
28650 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28651 const X86Subtarget &Subtarget) {
28652 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28655 EVT VT = N->getValueType(0);
28657 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28658 if (!VT.is128BitVector() && !VT.is256BitVector())
28661 if (VT.getVectorElementType() != MVT::i32 &&
28662 VT.getVectorElementType() != MVT::i64 &&
28663 VT.getVectorElementType() != MVT::f32 &&
28664 VT.getVectorElementType() != MVT::f64)
28667 SDValue N0 = N->getOperand(0);
28668 SDValue N1 = N->getOperand(1);
28670 // Check that both sources are concats with undef.
28671 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
28672 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
28673 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
28674 !N1.getOperand(1).isUndef())
28677 // Construct the new shuffle mask. Elements from the first source retain their
28678 // index, but elements from the second source no longer need to skip an undef.
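// For example, with v2i32 halves t1 and t2 feeding two v4i32 concats, the
// mask <0,1,4,5> over the two concats becomes <0,1,2,3> over concat(t1, t2).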
28679 SmallVector<int, 8> Mask;
28680 int NumElts = VT.getVectorNumElements();
28682 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28683 for (int Elt : SVOp->getMask())
28684 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
28687 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
28689 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28692 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28693 TargetLowering::DAGCombinerInfo &DCI,
28694 const X86Subtarget &Subtarget) {
28696 EVT VT = N->getValueType(0);
28697 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28698 // If we have legalized the vector types, look for blends of FADD and FSUB
28699 // nodes that we can fuse into an ADDSUB node.
28700 if (TLI.isTypeLegal(VT))
28701 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
28704 // During Type Legalization, when promoting illegal vector types,
28705 // the backend might introduce new shuffle dag nodes and bitcasts.
28707 // This code performs the following transformation:
28708 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28709 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28711 // We do this only if both the bitcast and the BINOP dag nodes have
28712 // one use. Also, perform this transformation only if the new binary
28713 // operation is legal. This is to avoid introducing dag nodes that
28714 // potentially need to be further expanded (or custom lowered) into a
28715 // less optimal sequence of dag nodes.
28716 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28717 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28718 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28719 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28720 SDValue N0 = N->getOperand(0);
28721 SDValue N1 = N->getOperand(1);
28723 SDValue BC0 = N0.getOperand(0);
28724 EVT SVT = BC0.getValueType();
28725 unsigned Opcode = BC0.getOpcode();
28726 unsigned NumElts = VT.getVectorNumElements();
28728 if (BC0.hasOneUse() && SVT.isVector() &&
28729 SVT.getVectorNumElements() * 2 == NumElts &&
28730 TLI.isOperationLegal(Opcode, VT)) {
28731 bool CanFold = false;
28737 // isOperationLegal lies for integer ops on floating point types.
28738 CanFold = VT.isInteger();
28743 // isOperationLegal lies for floating point ops on integer types.
28744 CanFold = VT.isFloatingPoint();
28748 unsigned SVTNumElts = SVT.getVectorNumElements();
28749 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28750 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28751 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28752 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28753 CanFold = SVOp->getMaskElt(i) < 0;
28756 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28757 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28758 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28759 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28764 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28765 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28766 // consecutive, non-overlapping, and in the right order.
28767 SmallVector<SDValue, 16> Elts;
28768 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
28769 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
28770 Elts.push_back(Elt);
28777 if (Elts.size() == VT.getVectorNumElements())
28778 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28781 // For AVX2, we sometimes want to combine
28782 // (vector_shuffle <mask> (concat_vectors t1, undef)
28783 // (concat_vectors t2, undef))
28785 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28786 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28787 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
28790 if (isTargetShuffle(N->getOpcode())) {
28792 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
28795 // Try recursively combining arbitrary sequences of x86 shuffle
28796 // instructions into higher-order shuffles. We do this after combining
28797 // specific PSHUF instruction sequences into their minimal form so that we
28798 // can evaluate how many specialized shuffle instructions are involved in
28799 // a particular chain.
28800 SmallVector<int, 1> NonceMask; // Just a placeholder.
28801 NonceMask.push_back(0);
28802 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
28803 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
28805 return SDValue(); // This routine will use CombineTo to replace N.
28811 /// Check if a vector extract from a target-specific shuffle of a load can be
28812 /// folded into a single element load.
28813 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28814 /// shuffles have been custom lowered so we need to handle those here.
28815 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28816 TargetLowering::DAGCombinerInfo &DCI) {
28817 if (DCI.isBeforeLegalizeOps())
28820 SDValue InVec = N->getOperand(0);
28821 SDValue EltNo = N->getOperand(1);
28822 EVT EltVT = N->getValueType(0);
28824 if (!isa<ConstantSDNode>(EltNo))
28827 EVT OriginalVT = InVec.getValueType();
28829 // Peek through bitcasts, don't duplicate a load with other uses.
28830 InVec = peekThroughOneUseBitcasts(InVec);
28832 EVT CurrentVT = InVec.getValueType();
28833 if (!CurrentVT.isVector() ||
28834 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28837 if (!isTargetShuffle(InVec.getOpcode()))
28840 // Don't duplicate a load with other uses.
28841 if (!InVec.hasOneUse())
28844 SmallVector<int, 16> ShuffleMask;
28845 SmallVector<SDValue, 2> ShuffleOps;
28847 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28848 ShuffleOps, ShuffleMask, UnaryShuffle))
28851 // Select the input vector, guarding against out of range extract vector.
28852 unsigned NumElems = CurrentVT.getVectorNumElements();
28853 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28854 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28856 if (Idx == SM_SentinelZero)
28857 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28858 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28859 if (Idx == SM_SentinelUndef)
28860 return DAG.getUNDEF(EltVT);
28862 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28863 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28866 // If inputs to shuffle are the same for both ops, then allow 2 uses
28867 unsigned AllowedUses =
28868 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28870 if (LdNode.getOpcode() == ISD::BITCAST) {
28871 // Don't duplicate a load with other uses.
28872 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28875 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28876 LdNode = LdNode.getOperand(0);
28879 if (!ISD::isNormalLoad(LdNode.getNode()))
28882 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28884 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28887 // If there's a bitcast before the shuffle, check if the load type and
28888 // alignment is valid.
28889 unsigned Align = LN0->getAlignment();
28890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28891 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28892 EltVT.getTypeForEVT(*DAG.getContext()));
28894 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28897 // All checks match so transform back to vector_shuffle so that DAG combiner
28898 // can finish the job
28901 // Create a shuffle node, taking into account the case that it's a unary shuffle.
28902 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28903 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28905 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28906 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28910 // Try to match patterns such as
28911 // (i16 bitcast (v16i1 x))
28913 // (i16 movmsk (v16i8 sext (v16i1 x)))
28914 // before the illegal vector is scalarized on subtargets that don't have legal
28916 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
28917 const X86Subtarget &Subtarget) {
28918 EVT VT = BitCast.getValueType();
28919 SDValue N0 = BitCast.getOperand(0);
28920 EVT VecVT = N0->getValueType(0);
28922 if (!VT.isScalarInteger() || !VecVT.isSimple())
28925 // With AVX512 vxi1 types are legal and we prefer using k-regs.
28926 // MOVMSK is supported in SSE2 or later.
28927 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
28930 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
28931 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
28932 // v8i16 and v16i16.
28933 // For these two cases, we can shuffle the upper element bytes to a
28934 // consecutive sequence at the start of the vector and treat the results as
28935 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
28936 // for v16i16 this is not the case, because the shuffle is expensive, so we
28937 // avoid sign-extending to this type entirely.
28938 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
28939 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
28941 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
28942 switch (VecVT.getSimpleVT().SimpleTy) {
28946 SExtVT = MVT::v2i64;
28947 FPCastVT = MVT::v2f64;
28950 SExtVT = MVT::v4i32;
28951 FPCastVT = MVT::v4f32;
28952 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
28953 // sign-extend to a 256-bit operation to avoid truncation.
28954 if (N0->getOpcode() == ISD::SETCC &&
28955 N0->getOperand(0)->getValueType(0).is256BitVector() &&
28956 Subtarget.hasInt256()) {
28957 SExtVT = MVT::v4i64;
28958 FPCastVT = MVT::v4f64;
28962 SExtVT = MVT::v8i16;
28963 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
28964 // sign-extend to a 256-bit operation to match the compare.
28965 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
28966 // 256-bit because the shuffle is cheaper than sign extending the result of
28968 if (N0->getOpcode() == ISD::SETCC &&
28969 N0->getOperand(0)->getValueType(0).is256BitVector() &&
28970 Subtarget.hasInt256()) {
28971 SExtVT = MVT::v8i32;
28972 FPCastVT = MVT::v8f32;
28976 SExtVT = MVT::v16i8;
28977 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
28978 // it is not profitable to sign-extend to 256-bit because this will
28979 // require an extra cross-lane shuffle which is more expensive than
28980 // truncating the result of the compare to 128-bits.
28983 // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
28984 if (!Subtarget.hasInt256())
28986 SExtVT = MVT::v32i8;
28991 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
28992 if (SExtVT == MVT::v8i16) {
28993 V = DAG.getBitcast(MVT::v16i8, V);
28994 V = DAG.getVectorShuffle(
28995 MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
28996 {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
28998 assert(SExtVT.getScalarType() != MVT::i16 &&
28999 "Vectors of i16 must be shuffled");
29000 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29001 V = DAG.getBitcast(FPCastVT, V);
29002 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29003 return DAG.getZExtOrTrunc(V, DL, VT);
29006 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29007 TargetLowering::DAGCombinerInfo &DCI,
29008 const X86Subtarget &Subtarget) {
29009 SDValue N0 = N->getOperand(0);
29010 EVT VT = N->getValueType(0);
29011 EVT SrcVT = N0.getValueType();
29013 // Try to match patterns such as
29014 // (i16 bitcast (v16i1 x))
29016 // (i16 movmsk (v16i8 sext (v16i1 x)))
29017 // before the setcc result is scalarized on subtargets that don't have legal
29019 if (DCI.isBeforeLegalize())
29020 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29022 // Since MMX types are special and don't usually play with other vector types,
29023 // it's better to handle them early to be sure we emit efficient code by
29024 // avoiding store-load conversions.
29026 // Detect bitcasts between i32 to x86mmx low word.
29027 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29028 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29029 SDValue N00 = N0->getOperand(0);
29030 if (N00.getValueType() == MVT::i32)
29031 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29034 // Detect bitcasts between element or subvector extraction to x86mmx.
29035 if (VT == MVT::x86mmx &&
29036 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29037 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29038 isNullConstant(N0.getOperand(1))) {
29039 SDValue N00 = N0->getOperand(0);
29040 if (N00.getValueType().is128BitVector())
29041 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29042 DAG.getBitcast(MVT::v2i64, N00));
29045 // Detect bitcasts from FP_TO_SINT to x86mmx.
29046 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29047 N0.getOpcode() == ISD::FP_TO_SINT) {
29049 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29050 DAG.getUNDEF(MVT::v2i32));
29051 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29052 DAG.getBitcast(MVT::v2i64, Res));
29055 // Convert a bitcasted integer logic operation that has one bitcasted
29056 // floating-point operand into a floating-point logic operation. This may
29057 // create a load of a constant, but that is cheaper than materializing the
29058 // constant in an integer register and transferring it to an SSE register or
29059 // transferring the SSE operand to integer register and back.
29061 switch (N0.getOpcode()) {
29062 case ISD::AND: FPOpcode = X86ISD::FAND; break;
29063 case ISD::OR: FPOpcode = X86ISD::FOR; break;
29064 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29065 default: return SDValue();
29068 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29069 (Subtarget.hasSSE2() && VT == MVT::f64)))
29072 SDValue LogicOp0 = N0.getOperand(0);
29073 SDValue LogicOp1 = N0.getOperand(1);
29076 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29077 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29078 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29079 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29080 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29081 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29082 }
29083 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29084 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29085 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29086 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29087 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29088 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29089 }
29091 return SDValue();
29092 }
29094 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29095 // the elements of a vector.
29096 // Returns the vector that is being reduced on, or SDValue() if a reduction
29097 // was not matched.
29098 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29099 // The pattern must end in an extract from index 0.
29100 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29101 !isNullConstant(Extract->getOperand(1)))
29102 return SDValue();
29104 unsigned Stages =
29105 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29107 SDValue Op = Extract->getOperand(0);
29108 // At each stage, we're looking for something that looks like:
29109 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29110 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29111 // i32 undef, i32 undef, i32 undef, i32 undef>
29112 // %a = binop <8 x i32> %op, %s
29113 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29114 // we expect something like:
29115 // <4,5,6,7,u,u,u,u>
29116 // <2,3,u,u,u,u,u,u>
29117 // <1,u,u,u,u,u,u,u>
29118 for (unsigned i = 0; i < Stages; ++i) {
29119 if (Op.getOpcode() != BinOp)
29120 return SDValue();
29122 ShuffleVectorSDNode *Shuffle =
29123 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29124 if (Shuffle) {
29125 Op = Op.getOperand(1);
29126 } else {
29127 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29128 Op = Op.getOperand(0);
29129 }
29131 // The first operand of the shuffle should be the same as the other operand
29132 // of the binop.
29133 if (!Shuffle || (Shuffle->getOperand(0) != Op))
29134 return SDValue();
29136 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29137 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29138 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29139 return SDValue();
29140 }
29142 return Op;
29143 }
29145 // Given a select, detect the following pattern:
29146 // 1: %2 = zext <N x i8> %0 to <N x i32>
29147 // 2: %3 = zext <N x i8> %1 to <N x i32>
29148 // 3: %4 = sub nsw <N x i32> %2, %3
29149 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29150 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29151 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29152 // This is useful as it is the input into a SAD pattern.
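// An illustrative instantiation for N = 4 (value names are hypothetical):
// %4 = sub nsw <4 x i32> %2, %3
// %5 = icmp sgt <4 x i32> %4, <i32 -1, i32 -1, i32 -1, i32 -1>
// %6 = sub nsw <4 x i32> zeroinitializer, %4
// %7 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> %6 ; == abs(%2 - %3)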
29153 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29154 SDValue &Op1) {
29155 // Check that the condition of the select instruction is greater-than.
29156 SDValue SetCC = Select->getOperand(0);
29157 if (SetCC.getOpcode() != ISD::SETCC)
29158 return false;
29159 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29160 if (CC != ISD::SETGT && CC != ISD::SETLT)
29161 return false;
29163 SDValue SelectOp1 = Select->getOperand(1);
29164 SDValue SelectOp2 = Select->getOperand(2);
29166 // The following instructions assume SelectOp1 is the subtraction operand
29167 // and SelectOp2 is the negation operand.
29168 // In the case of SETLT this is the other way around.
29169 if (CC == ISD::SETLT)
29170 std::swap(SelectOp1, SelectOp2);
29172 // The second operand of the select should be the negation of the first
29173 // operand, which is implemented as 0 - SelectOp1.
29174 if (!(SelectOp2.getOpcode() == ISD::SUB &&
29175 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29176 SelectOp2.getOperand(1) == SelectOp1))
29177 return false;
29179 // The first operand of SetCC is the first operand of the select, which is the
29180 // difference between the two input vectors.
29181 if (SetCC.getOperand(0) != SelectOp1)
29182 return false;
29184 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29185 APInt SplatVal;
29186 if ((CC == ISD::SETLT) &&
29187 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29188 SplatVal == 1) ||
29189 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29190 return false;
29192 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29193 if ((CC == ISD::SETGT) &&
29194 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29195 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29196 return false;
29198 // The first operand of the select is the difference between the two input
29199 // vectors.
29200 if (SelectOp1.getOpcode() != ISD::SUB)
29201 return false;
29203 Op0 = SelectOp1.getOperand(0);
29204 Op1 = SelectOp1.getOperand(1);
29206 // Check if the operands of the sub are zero-extended from vectors of i8.
29207 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29208 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29209 Op1.getOpcode() != ISD::ZERO_EXTEND ||
29210 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29211 return false;
29213 return true;
29214 }
29216 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29217 // to these zexts.
29218 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29219 const SDValue &Zext1, const SDLoc &DL) {
29221 // Find the appropriate width for the PSADBW.
29222 EVT InVT = Zext0.getOperand(0).getValueType();
29223 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29225 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29226 // fill in the missing vector elements with 0.
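// E.g. (sketch) a v4i8 input with RegSize == 128 gives NumConcat == 4 and
// produces (v16i8 concat_vectors X, zero, zero, zero), placing X in lane 0.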
29227 unsigned NumConcat = RegSize / InVT.getSizeInBits();
29228 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29229 Ops[0] = Zext0.getOperand(0);
29230 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29231 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29232 Ops[0] = Zext1.getOperand(0);
29233 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29235 // Actually build the SAD
29236 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29237 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29238 }
29240 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29241 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29242 SelectionDAG &DAG,
29243 const X86Subtarget &Subtarget) {
29244 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29245 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29246 return SDValue();
29248 EVT ExtractVT = Extract->getValueType(0);
29249 unsigned BitWidth = ExtractVT.getSizeInBits();
29250 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29251 ExtractVT != MVT::i8)
29252 return SDValue();
29254 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29255 for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29256 SDValue Match = matchBinOpReduction(Extract, Op);
29257 if (!Match)
29258 continue;
29260 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29261 // which we can't support here for now.
29262 if (Match.getScalarValueSizeInBits() != BitWidth)
29263 continue;
29265 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
29266 unsigned MatchSizeInBits = Match.getValueSizeInBits();
29267 if (!(MatchSizeInBits == 128 ||
29268 (MatchSizeInBits == 256 &&
29269 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29270 continue;
29272 // Don't bother performing this for 2-element vectors.
29273 if (Match.getValueType().getVectorNumElements() <= 2)
29274 continue;
29276 // Check that we are extracting a reduction of all sign bits.
29277 if (DAG.ComputeNumSignBits(Match) != BitWidth)
29278 continue;
29280 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29281 MVT MaskVT;
29282 if (64 == BitWidth || 32 == BitWidth)
29283 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29284 MatchSizeInBits / BitWidth);
29285 else
29286 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29288 APInt CompareBits;
29289 ISD::CondCode CondCode;
29290 if (Op == ISD::OR) {
29291 // any_of -> MOVMSK != 0
29292 CompareBits = APInt::getNullValue(32);
29293 CondCode = ISD::CondCode::SETNE;
29294 } else {
29295 // all_of -> MOVMSK == ((1 << NumElts) - 1)
29296 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29297 CondCode = ISD::CondCode::SETEQ;
29298 }
29300 // Perform the select as i32/i64 and then truncate to avoid partial register
29301 // stalls.
29302 unsigned ResWidth = std::max(BitWidth, 32u);
29303 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29304 SDLoc DL(Extract);
29305 SDValue Zero = DAG.getConstant(0, DL, ResVT);
29306 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29307 SDValue Res = DAG.getBitcast(MaskVT, Match);
29308 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29309 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29310 Ones, Zero, CondCode);
29311 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29312 }
29314 return SDValue();
29315 }
29317 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29318 const X86Subtarget &Subtarget) {
29319 // PSADBW is only supported on SSE2 and up.
29320 if (!Subtarget.hasSSE2())
29321 return SDValue();
29323 // Verify that the type we're extracting from is an integer type wider than i16.
29324 EVT VT = Extract->getOperand(0).getValueType();
29325 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29326 return SDValue();
29328 unsigned RegSize = 128;
29329 if (Subtarget.hasBWI())
29330 RegSize = 512;
29331 else if (Subtarget.hasAVX2())
29332 RegSize = 256;
29334 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29335 // TODO: We should be able to handle larger vectors by splitting them before
29336 // feeding them into several SADs, and then reducing over those.
29337 if (RegSize / VT.getVectorNumElements() < 8)
29338 return SDValue();
29340 // Match shuffle + add pyramid.
29341 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29343 // The operand is expected to be zero extended from i8
29344 // (verified in detectZextAbsDiff).
29345 // In order to convert to i64 and above, additional any/zero/sign
29346 // extend is expected.
29347 // The zero extend from 32 bits has no mathematical effect on the result.
29348 // Also, the sign extend is effectively a zero extend here
29349 // (it extends the sign bit, which is zero).
29350 // So it is correct to skip the sign/zero extend instruction.
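// E.g. (sketch) in (i64 (zext (i32 X))) where X is the 32-bit partial SAD
// sum, the upper bits of X are already zero, so the extend is transparent.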
29351 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29352 Root.getOpcode() == ISD::ZERO_EXTEND ||
29353 Root.getOpcode() == ISD::ANY_EXTEND))
29354 Root = Root.getOperand(0);
29356 // If there was a match, we want Root to be a select that is the root of an
29357 // abs-diff pattern.
29358 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29359 return SDValue();
29361 // Check whether we have an abs-diff pattern feeding into the select.
29362 SDValue Zext0, Zext1;
29363 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29364 return SDValue();
29366 // Create the SAD instruction.
29367 SDLoc DL(Extract);
29368 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29370 // If the original vector was wider than 8 elements, sum over the results
29371 // in the SAD vector.
29372 unsigned Stages = Log2_32(VT.getVectorNumElements());
29373 MVT SadVT = SAD.getSimpleValueType();
29374 if (Stages > 3) {
29375 unsigned SadElems = SadVT.getVectorNumElements();
29377 for (unsigned i = Stages - 3; i > 0; --i) {
29378 SmallVector<int, 16> Mask(SadElems, -1);
29379 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29380 Mask[j] = MaskEnd + j;
29382 SDValue Shuffle =
29383 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29384 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29385 }
29386 }
29388 MVT Type = Extract->getSimpleValueType(0);
29389 unsigned TypeSizeInBits = Type.getSizeInBits();
29390 // Return the lowest TypeSizeInBits bits.
29391 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29392 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29393 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29394 Extract->getOperand(1));
29395 }
29397 // Attempt to peek through a target shuffle and extract the scalar from the
29398 // source.
29399 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29400 TargetLowering::DAGCombinerInfo &DCI,
29401 const X86Subtarget &Subtarget) {
29402 if (DCI.isBeforeLegalizeOps())
29403 return SDValue();
29405 SDValue Src = N->getOperand(0);
29406 SDValue Idx = N->getOperand(1);
29408 EVT VT = N->getValueType(0);
29409 EVT SrcVT = Src.getValueType();
29410 EVT SrcSVT = SrcVT.getVectorElementType();
29411 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29413 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29414 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29415 return SDValue();
29417 // Resolve the target shuffle inputs and mask.
29418 SmallVector<int, 16> Mask;
29419 SmallVector<SDValue, 2> Ops;
29420 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
29421 return SDValue();
29423 // Attempt to narrow/widen the shuffle mask to the correct size.
29424 if (Mask.size() != NumSrcElts) {
29425 if ((NumSrcElts % Mask.size()) == 0) {
29426 SmallVector<int, 16> ScaledMask;
29427 int Scale = NumSrcElts / Mask.size();
29428 scaleShuffleMask(Scale, Mask, ScaledMask);
29429 Mask = std::move(ScaledMask);
29430 } else if ((Mask.size() % NumSrcElts) == 0) {
29431 SmallVector<int, 16> WidenedMask;
29432 while (Mask.size() > NumSrcElts &&
29433 canWidenShuffleElements(Mask, WidenedMask))
29434 Mask = std::move(WidenedMask);
29435 // TODO - investigate support for wider shuffle masks with known upper
29436 // undef/zero elements for implicit zero-extension.
29437 }
29438 }
29440 // Check if narrowing/widening failed.
29441 if (Mask.size() != NumSrcElts)
29442 return SDValue();
29444 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29445 SDLoc dl(N);
29447 // If the shuffle source element is undef/zero then we can just accept it.
29448 if (SrcIdx == SM_SentinelUndef)
29449 return DAG.getUNDEF(VT);
29451 if (SrcIdx == SM_SentinelZero)
29452 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29453 : DAG.getConstant(0, dl, VT);
29455 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29456 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29457 SrcIdx = SrcIdx % Mask.size();
29459 // We can only extract other elements from 128-bit vectors and in certain
29460 // circumstances, depending on SSE-level.
29461 // TODO: Investigate using extract_subvector for larger vectors.
29462 // TODO: Investigate float/double extraction if it will be just stored.
29463 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29464 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29465 assert(SrcSVT == VT && "Unexpected extraction type");
29466 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29467 DAG.getIntPtrConstant(SrcIdx, dl));
29470 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29471 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29472 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29473 "Unexpected extraction type");
29474 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29475 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29476 DAG.getIntPtrConstant(SrcIdx, dl));
29477 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29478 DAG.getValueType(SrcSVT));
29479 return DAG.getZExtOrTrunc(Assert, dl, VT);
29480 }
29482 return SDValue();
29483 }
29485 /// Detect vector gather/scatter index generation and convert it from being a
29486 /// bunch of shuffles and extracts into a somewhat faster sequence.
29487 /// For i686, the best sequence is apparently storing the value and loading
29488 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
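/// E.g. (sketch) for v4i32 on x64: bitcast to v2i64, extract both i64 halves,
/// then produce each i32 as either a truncate or a truncate of a 32-bit
/// arithmetic shift, as implemented below.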
29489 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29490 TargetLowering::DAGCombinerInfo &DCI,
29491 const X86Subtarget &Subtarget) {
29492 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29493 return NewOp;
29495 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29496 return NewOp;
29498 SDValue InputVector = N->getOperand(0);
29499 SDValue EltIdx = N->getOperand(1);
29501 EVT SrcVT = InputVector.getValueType();
29502 EVT VT = N->getValueType(0);
29503 SDLoc dl(InputVector);
29505 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
29506 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29507 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29508 SDValue MMXSrc = InputVector.getOperand(0);
29510 // The bitcast source is a direct mmx result.
29511 if (MMXSrc.getValueType() == MVT::x86mmx)
29512 return DAG.getBitcast(VT, InputVector);
29513 }
29515 // Detect mmx to i32 conversion through a v2i32 elt extract.
29516 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29517 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29518 SDValue MMXSrc = InputVector.getOperand(0);
29520 // The bitcast source is a direct mmx result.
29521 if (MMXSrc.getValueType() == MVT::x86mmx)
29522 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29523 }
29525 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29526 isa<ConstantSDNode>(EltIdx) &&
29527 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29528 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29529 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29530 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29531 return DAG.getConstant(Res, dl, MVT::i1);
29532 }
29534 // Check whether this extract is the root of a sum of absolute differences
29535 // pattern. This has to be done here because we really want it to happen
29536 // pre-legalization.
29537 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29538 return SAD;
29540 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29541 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29542 return Cmp;
29544 // Only operate on vectors of 4 elements, where the alternative shuffling
29545 // gets to be more expensive.
29546 if (SrcVT != MVT::v4i32)
29547 return SDValue();
29549 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29550 // single use which is a sign-extend or zero-extend, and all elements are
29551 // used.
29552 SmallVector<SDNode *, 4> Uses;
29553 unsigned ExtractedElements = 0;
29554 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29555 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29556 if (UI.getUse().getResNo() != InputVector.getResNo())
29557 continue;
29559 SDNode *Extract = *UI;
29560 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29561 return SDValue();
29563 if (Extract->getValueType(0) != MVT::i32)
29564 return SDValue();
29565 if (!Extract->hasOneUse())
29566 return SDValue();
29567 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29568 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29569 return SDValue();
29570 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29571 return SDValue();
29573 // Record which element was extracted.
29574 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29575 Uses.push_back(Extract);
29576 }
29578 // If not all the elements were used, this may not be worthwhile.
29579 if (ExtractedElements != 15)
29580 return SDValue();
29582 // Ok, we've now decided to do the transformation.
29583 // If 64-bit shifts are legal, use the extract-shift sequence,
29584 // otherwise bounce the vector off the cache.
29585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29587 SDValue Vals[4];
29588 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29589 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29590 auto &DL = DAG.getDataLayout();
29591 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29592 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29593 DAG.getConstant(0, dl, VecIdxTy));
29594 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29595 DAG.getConstant(1, dl, VecIdxTy));
29597 SDValue ShAmt = DAG.getConstant(
29598 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29599 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29600 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29601 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29602 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29603 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29604 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29605 } else {
29606 // Store the value to a temporary stack slot.
29607 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29608 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29609 MachinePointerInfo());
29611 EVT ElementType = SrcVT.getVectorElementType();
29612 unsigned EltSize = ElementType.getSizeInBits() / 8;
29614 // Replace each use (extract) with a load of the appropriate element.
29615 for (unsigned i = 0; i < 4; ++i) {
29616 uint64_t Offset = EltSize * i;
29617 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29618 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29620 SDValue ScalarAddr =
29621 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29623 // Load the scalar.
29624 Vals[i] =
29625 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29626 }
29627 }
29629 // Replace the extracts.
29630 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29631 UE = Uses.end(); UI != UE; ++UI) {
29632 SDNode *Extract = *UI;
29634 uint64_t IdxVal = Extract->getConstantOperandVal(1);
29635 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29636 }
29638 // The replacement was made in place; don't return anything.
29639 return SDValue();
29640 }
29642 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29643 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
29644 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
29645 // combineBasicSADPattern.
29646 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
29647 TargetLowering::DAGCombinerInfo &DCI,
29648 const X86Subtarget &Subtarget) {
29649 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
29650 }
29652 /// If a vector select has an operand that is -1 or 0, try to simplify the
29653 /// select to a bitwise logic operation.
29654 static SDValue
29655 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
29656 TargetLowering::DAGCombinerInfo &DCI,
29657 const X86Subtarget &Subtarget) {
29658 SDValue Cond = N->getOperand(0);
29659 SDValue LHS = N->getOperand(1);
29660 SDValue RHS = N->getOperand(2);
29661 EVT VT = LHS.getValueType();
29662 EVT CondVT = Cond.getValueType();
29663 SDLoc DL(N);
29664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29666 if (N->getOpcode() != ISD::VSELECT)
29667 return SDValue();
29669 assert(CondVT.isVector() && "Vector select expects a vector selector!");
29671 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29672 // Check if the first operand is all zeros and Cond type is vXi1.
29673 // This situation only applies to avx512.
29674 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
29675 CondVT.getVectorElementType() == MVT::i1) {
29676 // Invert the cond to not(cond) : xor(op,allones)=not(op)
29677 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
29678 DAG.getAllOnesConstant(DL, CondVT));
29679 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
29680 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
29681 }
29683 // To use the condition operand as a bitwise mask, it must have elements that
29684 // are the same size as the select elements. Ie, the condition operand must
29685 // have already been promoted from the IR select condition type <N x i1>.
29686 // Don't check if the types themselves are equal because that excludes
29687 // vector floating-point selects.
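// E.g. (sketch) a v4f32 select needs a v4i32 condition: four 32-bit lanes
// matching the four f32 elements, even though the types themselves differ.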
29688 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
29689 return SDValue();
29691 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
29692 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
29694 // Try to invert the condition if true value is not all 1s and false value is
29695 // not all 0s.
29696 if (!TValIsAllOnes && !FValIsAllZeros &&
29697 // Check if the selector will be produced by CMPP*/PCMP*.
29698 Cond.getOpcode() == ISD::SETCC &&
29699 // Check if SETCC has already been promoted.
29700 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
29701 CondVT) {
29702 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
29703 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
29705 if (TValIsAllZeros || FValIsAllOnes) {
29706 SDValue CC = Cond.getOperand(2);
29707 ISD::CondCode NewCC =
29708 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
29709 Cond.getOperand(0).getValueType().isInteger());
29710 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
29711 NewCC);
29712 std::swap(LHS, RHS);
29713 TValIsAllOnes = FValIsAllOnes;
29714 FValIsAllZeros = TValIsAllZeros;
29715 }
29716 }
29718 // vselect Cond, 111..., 000... -> Cond
29719 if (TValIsAllOnes && FValIsAllZeros)
29720 return DAG.getBitcast(VT, Cond);
29722 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
29723 return SDValue();
29725 // vselect Cond, 111..., X -> or Cond, X
29726 if (TValIsAllOnes) {
29727 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
29728 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
29729 return DAG.getBitcast(VT, Or);
29732 // vselect Cond, X, 000... -> and Cond, X
29733 if (FValIsAllZeros) {
29734 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
29735 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
29736 return DAG.getBitcast(VT, And);
29737 }
29739 return SDValue();
29740 }
29742 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
29743 SDValue Cond = N->getOperand(0);
29744 SDValue LHS = N->getOperand(1);
29745 SDValue RHS = N->getOperand(2);
29746 SDLoc DL(N);
29748 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
29749 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
29750 if (!TrueC || !FalseC)
29751 return SDValue();
29753 // Don't do this for illegal integer types.
29754 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
29755 return SDValue();
29757 // If this is efficiently invertible, canonicalize the TrueC/FalseC values
29758 // so that TrueC (the true value) is larger than FalseC.
29759 bool NeedsCondInvert = false;
29760 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
29761 // Efficiently invertible.
29762 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
29763 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
29764 isa<ConstantSDNode>(Cond.getOperand(1))))) {
29765 NeedsCondInvert = true;
29766 std::swap(TrueC, FalseC);
29769 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
29770 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29771 if (NeedsCondInvert) // Invert the condition if needed.
29772 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29773 DAG.getConstant(1, DL, Cond.getValueType()));
29775 // Zero extend the condition if needed.
29776 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
29778 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29779 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
29780 DAG.getConstant(ShAmt, DL, MVT::i8));
29781 }
29783 // Optimize cases that will turn into an LEA instruction. This requires
29784 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
29785 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29786 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
29787 if (N->getValueType(0) == MVT::i32)
29788 Diff = (unsigned)Diff;
29790 bool isFastMultiplier = false;
29791 if (Diff < 10) {
29792 switch ((unsigned char)Diff) {
29793 default:
29794 break;
29795 case 1: // result = add base, cond
29796 case 2: // result = lea base( , cond*2)
29797 case 3: // result = lea base(cond, cond*2)
29798 case 4: // result = lea base( , cond*4)
29799 case 5: // result = lea base(cond, cond*4)
29800 case 8: // result = lea base( , cond*8)
29801 case 9: // result = lea base(cond, cond*8)
29802 isFastMultiplier = true;
29803 break;
29804 }
29805 }
29807 if (isFastMultiplier) {
29808 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
29809 if (NeedsCondInvert) // Invert the condition if needed.
29810 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
29811 DAG.getConstant(1, DL, Cond.getValueType()));
29813 // Zero extend the condition if needed.
29814 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
29815 // Scale the condition by the difference.
29816 if (Diff != 1)
29817 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29818 DAG.getConstant(Diff, DL, Cond.getValueType()));
29820 // Add the base if non-zero.
29821 if (FalseC->getAPIntValue() != 0)
29822 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29823 SDValue(FalseC, 0));
29824 return Cond;
29825 }
29826 }
29828 return SDValue();
29829 }
29831 // If this is a bitcasted op that can be represented as another type, push
29832 // the bitcast to the inputs. This allows more opportunities for pattern
29833 // matching masked instructions. This is called when we know that the operation
29834 // is used as one of the inputs of a vselect.
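// E.g. (sketch, AVX-512) if a v8i64 VALIGN result is bitcast to v16i32 and
// selected with a v16i1 mask, rewriting it as a v16i32 VALIGND on bitcast
// inputs lets the mask fold directly into the instruction.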
29835 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
29836 TargetLowering::DAGCombinerInfo &DCI) {
29837 // Make sure we have a bitcast.
29838 if (OrigOp.getOpcode() != ISD::BITCAST)
29839 return false;
29841 SDValue Op = OrigOp.getOperand(0);
29843 // If the operation is used by anything other than the bitcast, we shouldn't
29844 // do this combine as that would replicate the operation.
29845 if (!Op.hasOneUse())
29846 return false;
29848 MVT VT = OrigOp.getSimpleValueType();
29849 MVT EltVT = VT.getVectorElementType();
29850 SDLoc DL(Op.getNode());
29852 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
29853 SDValue Op2) {
29854 Op0 = DAG.getBitcast(VT, Op0);
29855 DCI.AddToWorklist(Op0.getNode());
29856 Op1 = DAG.getBitcast(VT, Op1);
29857 DCI.AddToWorklist(Op1.getNode());
29858 DCI.CombineTo(OrigOp.getNode(),
29859 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
29860 return true;
29861 };
29863 unsigned Opcode = Op.getOpcode();
29864 switch (Opcode) {
29865 case X86ISD::PALIGNR:
29866 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
29867 if (!VT.is128BitVector())
29868 return false;
29869 Opcode = X86ISD::VALIGN;
29870 LLVM_FALLTHROUGH;
29871 case X86ISD::VALIGN: {
29872 if (EltVT != MVT::i32 && EltVT != MVT::i64)
29873 return false;
29874 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29875 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29876 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
29877 unsigned EltSize = EltVT.getSizeInBits();
29878 // Make sure we can represent the same shift with the new VT.
29879 if ((ShiftAmt % EltSize) != 0)
29880 return false;
29881 Imm = ShiftAmt / EltSize;
29882 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29883 DAG.getConstant(Imm, DL, MVT::i8));
29884 }
29885 case X86ISD::SHUF128: {
29886 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
29887 return false;
29888 // Only change element size, not type.
29889 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29890 return false;
29891 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29892 Op.getOperand(2));
29893 }
29894 case ISD::INSERT_SUBVECTOR: {
29895 unsigned EltSize = EltVT.getSizeInBits();
29896 if (EltSize != 32 && EltSize != 64)
29897 return false;
29898 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29899 // Only change element size, not type.
29900 if (EltVT.isInteger() != OpEltVT.isInteger())
29901 return false;
29902 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29903 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29904 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
29905 DCI.AddToWorklist(Op0.getNode());
29906 // Op1 needs to be bitcasted to a smaller vector with the same element type.
29907 SDValue Op1 = Op.getOperand(1);
29908 MVT Op1VT = MVT::getVectorVT(EltVT,
29909 Op1.getSimpleValueType().getSizeInBits() / EltSize);
29910 Op1 = DAG.getBitcast(Op1VT, Op1);
29911 DCI.AddToWorklist(Op1.getNode());
29912 DCI.CombineTo(OrigOp.getNode(),
29913 DAG.getNode(Opcode, DL, VT, Op0, Op1,
29914 DAG.getIntPtrConstant(Imm, DL)));
29915 return true;
29916 }
29917 case ISD::EXTRACT_SUBVECTOR: {
29918 unsigned EltSize = EltVT.getSizeInBits();
29919 if (EltSize != 32 && EltSize != 64)
29920 return false;
29921 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29922 // Only change element size, not type.
29923 if (EltVT.isInteger() != OpEltVT.isInteger())
29924 return false;
29925 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
29926 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29927 // Op0 needs to be bitcasted to a larger vector with the same element type.
29928 SDValue Op0 = Op.getOperand(0);
29929 MVT Op0VT = MVT::getVectorVT(EltVT,
29930 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29931 Op0 = DAG.getBitcast(Op0VT, Op0);
29932 DCI.AddToWorklist(Op0.getNode());
29933 DCI.CombineTo(OrigOp.getNode(),
29934 DAG.getNode(Opcode, DL, VT, Op0,
29935 DAG.getIntPtrConstant(Imm, DL)));
29936 return true;
29937 }
29938 case X86ISD::SUBV_BROADCAST: {
29939 unsigned EltSize = EltVT.getSizeInBits();
29940 if (EltSize != 32 && EltSize != 64)
29941 return false;
29942 // Only change element size, not type.
29943 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29944 return false;
29945 SDValue Op0 = Op.getOperand(0);
29946 MVT Op0VT = MVT::getVectorVT(EltVT,
29947 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29948 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
29949 DCI.AddToWorklist(Op0.getNode());
29950 DCI.CombineTo(OrigOp.getNode(),
29951 DAG.getNode(Opcode, DL, VT, Op0));
29952 return true;
29953 }
29954 }
29956 return false;
29957 }
29959 /// Do target-specific dag combines on SELECT and VSELECT nodes.
29960 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
29961 TargetLowering::DAGCombinerInfo &DCI,
29962 const X86Subtarget &Subtarget) {
29963 SDLoc DL(N);
29964 SDValue Cond = N->getOperand(0);
29965 // Get the LHS/RHS of the select.
29966 SDValue LHS = N->getOperand(1);
29967 SDValue RHS = N->getOperand(2);
29968 EVT VT = LHS.getValueType();
29969 EVT CondVT = Cond.getValueType();
29970 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29972 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
29973 // instructions match the semantics of the common C idiom x<y?x:y but not
29974 // x<=y?x:y, because of how they handle negative zero (which can be
29975 // ignored in unsafe-math mode).
29976 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
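// E.g. (sketch) (select (setcc x, y, setolt), x, y) --> (FMIN x, y); the
// switch below rejects, or fixes by swapping operands, the cases where
// -0.0 or NaN behavior would differ.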
29977 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
29978 VT != MVT::f80 && VT != MVT::f128 &&
29979 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
29980 (Subtarget.hasSSE2() ||
29981 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
29982 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29984 unsigned Opcode = 0;
29985 // Check for x CC y ? x : y.
29986 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29987 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29988 switch (CC) {
29989 default: break;
29990 case ISD::SETULT:
29991 // Converting this to a min would handle NaNs incorrectly, and swapping
29992 // the operands would cause it to handle comparisons between positive
29993 // and negative zero incorrectly.
29994 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29995 if (!DAG.getTarget().Options.UnsafeFPMath &&
29996 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29997 break;
29998 std::swap(LHS, RHS);
29999 }
30000 Opcode = X86ISD::FMIN;
30001 break;
30002 case ISD::SETOLE:
30003 // Converting this to a min would handle comparisons between positive
30004 // and negative zero incorrectly.
30005 if (!DAG.getTarget().Options.UnsafeFPMath &&
30006 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30007 break;
30008 Opcode = X86ISD::FMIN;
30009 break;
30010 case ISD::SETULE:
30011 // Converting this to a min would handle both negative zeros and NaNs
30012 // incorrectly, but we can swap the operands to fix both.
30013 std::swap(LHS, RHS);
30014 LLVM_FALLTHROUGH;
30015 case ISD::SETOLT:
30016 case ISD::SETLT:
30017 case ISD::SETLE:
30018 Opcode = X86ISD::FMIN;
30019 break;
30021 case ISD::SETOGE:
30022 // Converting this to a max would handle comparisons between positive
30023 // and negative zero incorrectly.
30024 if (!DAG.getTarget().Options.UnsafeFPMath &&
30025 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30026 break;
30027 Opcode = X86ISD::FMAX;
30028 break;
30029 case ISD::SETUGT:
30030 // Converting this to a max would handle NaNs incorrectly, and swapping
30031 // the operands would cause it to handle comparisons between positive
30032 // and negative zero incorrectly.
30033 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30034 if (!DAG.getTarget().Options.UnsafeFPMath &&
30035 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30036 break;
30037 std::swap(LHS, RHS);
30038 }
30039 Opcode = X86ISD::FMAX;
30040 break;
30041 case ISD::SETUGE:
30042 // Converting this to a max would handle both negative zeros and NaNs
30043 // incorrectly, but we can swap the operands to fix both.
30044 std::swap(LHS, RHS);
30045 LLVM_FALLTHROUGH;
30046 case ISD::SETOGT:
30047 case ISD::SETGT:
30048 case ISD::SETGE:
30049 Opcode = X86ISD::FMAX;
30050 break;
30051 }
30052 // Check for x CC y ? y : x -- a min/max with reversed arms.
30053 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30054 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
30055 switch (CC) {
30056 default: break;
30057 case ISD::SETOGE:
30058 // Converting this to a min would handle comparisons between positive
30059 // and negative zero incorrectly, and swapping the operands would
30060 // cause it to handle NaNs incorrectly.
30061 if (!DAG.getTarget().Options.UnsafeFPMath &&
30062 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
30063 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30064 break;
30065 std::swap(LHS, RHS);
30066 }
30067 Opcode = X86ISD::FMIN;
30068 break;
30069 case ISD::SETUGT:
30070 // Converting this to a min would handle NaNs incorrectly.
30071 if (!DAG.getTarget().Options.UnsafeFPMath &&
30072 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30073 break;
30074 Opcode = X86ISD::FMIN;
30075 break;
30076 case ISD::SETUGE:
30077 // Converting this to a min would handle both negative zeros and NaNs
30078 // incorrectly, but we can swap the operands to fix both.
30079 std::swap(LHS, RHS);
30080 LLVM_FALLTHROUGH;
30081 case ISD::SETOGT:
30082 case ISD::SETGT:
30083 case ISD::SETGE:
30084 Opcode = X86ISD::FMIN;
30085 break;
30087 case ISD::SETULT:
30088 // Converting this to a max would handle NaNs incorrectly.
30089 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30090 break;
30091 Opcode = X86ISD::FMAX;
30092 break;
30093 case ISD::SETOLE:
30094 // Converting this to a max would handle comparisons between positive
30095 // and negative zero incorrectly, and swapping the operands would
30096 // cause it to handle NaNs incorrectly.
30097 if (!DAG.getTarget().Options.UnsafeFPMath &&
30098 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30099 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30100 break;
30101 std::swap(LHS, RHS);
30102 }
30103 Opcode = X86ISD::FMAX;
30104 break;
30105 case ISD::SETULE:
30106 // Converting this to a max would handle both negative zeros and NaNs
30107 // incorrectly, but we can swap the operands to fix both.
30108 std::swap(LHS, RHS);
30109 LLVM_FALLTHROUGH;
30110 case ISD::SETOLT:
30111 case ISD::SETLT:
30112 case ISD::SETLE:
30113 Opcode = X86ISD::FMAX;
30114 break;
30115 }
30116 }
30118 if (Opcode)
30119 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30120 }
30122 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30123 // lowering on KNL. In this case we convert it to
30124 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30125 // The same situation for all 128 and 256-bit vectors of i8 and i16.
30126 // Since SKX, these selects have a proper lowering.
30127 if (Subtarget.hasAVX512() && CondVT.isVector() &&
30128 CondVT.getVectorElementType() == MVT::i1 &&
30129 (VT.is128BitVector() || VT.is256BitVector()) &&
30130 (VT.getVectorElementType() == MVT::i8 ||
30131 VT.getVectorElementType() == MVT::i16) &&
30132 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30133 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30134 DCI.AddToWorklist(Cond.getNode());
30135 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30136 }
30138 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30139 return V;
30141 // Canonicalize max and min:
30142 // (x > y) ? x : y -> (x >= y) ? x : y
30143 // (x < y) ? x : y -> (x <= y) ? x : y
30144 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30145 // the need for an extra compare
30146 // against zero. e.g.
30147 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
30148 // subl %esi, %edi
30149 // testl %edi, %edi
30150 // movl $0, %eax
30151 // cmovgl %edi, %eax
30152 // =>
30153 // xorl %eax, %eax
30154 // subl %esi, %edi
30155 // cmovsl %eax, %edi
30156 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30157 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30158 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30159 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30160 switch (CC) {
30161 default: break;
30162 case ISD::SETLT:
30163 case ISD::SETGT: {
30164 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30165 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30166 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30167 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30168 }
30169 }
30170 }
30172 // Early exit check.
30173 if (!TLI.isTypeLegal(VT))
30174 return SDValue();
30176 // Match VSELECTs into subs with unsigned saturation.
30177 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30178 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30179 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30180 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30181 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30183 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30184 // left side invert the predicate to simplify logic below.
30185 SDValue Other;
30186 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30187 Other = RHS;
30188 CC = ISD::getSetCCInverse(CC, true);
30189 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30190 Other = LHS;
30191 }
30193 if (Other.getNode() && Other->getNumOperands() == 2 &&
30194 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30195 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30196 SDValue CondRHS = Cond->getOperand(1);
30198 // Look for a general sub with unsigned saturation first.
30199 // x >= y ? x-y : 0 --> subus x, y
30200 // x > y ? x-y : 0 --> subus x, y
30201 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30202 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30203 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30205 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30206 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30207 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30208 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30209 // If the RHS is a constant we have to reverse the const
30210 // canonicalization.
30211 // x > C-1 ? x+-C : 0 --> subus x, C
30212 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30213 CondRHSConst->getAPIntValue() ==
30214 (-OpRHSConst->getAPIntValue() - 1))
30215 return DAG.getNode(
30216 X86ISD::SUBUS, DL, VT, OpLHS,
30217 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30219 // Another special case: If C was a sign bit, the sub has been
30220 // canonicalized into a xor.
30221 // FIXME: Would it be better to use computeKnownBits to determine
30222 // whether it's safe to decanonicalize the xor?
30223 // x s< 0 ? x^C : 0 --> subus x, C
30224 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30225 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30226 OpRHSConst->getAPIntValue().isSignMask())
30227 // Note that we have to rebuild the RHS constant here to ensure we
30228 // don't rely on particular values of undef lanes.
30229 return DAG.getNode(
30230 X86ISD::SUBUS, DL, VT, OpLHS,
30231 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30232 }
30233 }
30234 }
30236 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30237 return V;
30239 // If this is a *dynamic* select (non-constant condition) and we can match
30240 // this node with one of the variable blend instructions, restructure the
30241 // condition so that blends can use the high (sign) bit of each element and
30242 // use SimplifyDemandedBits to simplify the condition operand.
30243 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30244 !DCI.isBeforeLegalize() &&
30245 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30246 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30248 // Don't optimize vector selects that map to mask registers.
30249 if (BitWidth == 1)
30250 return SDValue();
30252 // We can only handle the cases where VSELECT is directly legal on the
30253 // subtarget. We custom lower VSELECT nodes with constant conditions and
30254 // this makes it hard to see whether a dynamic VSELECT will correctly
30255 // lower, so we both check the operation's status and explicitly handle the
30256 // cases where a *dynamic* blend will fail even though a constant-condition
30257 // blend could be custom lowered.
30258 // FIXME: We should find a better way to handle this class of problems.
30259 // Potentially, we should combine constant-condition vselect nodes
30260 // pre-legalization into shuffles and not mark as many types as custom
30261 // lowered.
30262 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30263 return SDValue();
30264 // FIXME: We don't support i16-element blends currently. We could and
30265 // should support them by making *all* the bits in the condition be set
30266 // rather than just the high bit and using an i8-element blend.
30267 if (VT.getVectorElementType() == MVT::i16)
30268 return SDValue();
30269 // Dynamic blending was only available from SSE4.1 onward.
30270 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30271 return SDValue();
30272 // Byte blends are only available in AVX2.
30273 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30274 return SDValue();
30276 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30277 APInt DemandedMask(APInt::getSignMask(BitWidth));
30278 KnownBits Known;
30279 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
30280 DCI.isBeforeLegalizeOps());
30281 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30282 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30283 // If we changed the computation somewhere in the DAG, this change will
30284 // affect all users of Cond. Make sure it is fine and update all the nodes
30285 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30286 // perform wrong optimizations as we messed with the actual expectation
30287 // for the vector boolean values.
30288 if (Cond != TLO.Old) {
30289 // Check all uses of the condition operand to check whether it will be
30290 // consumed by non-BLEND instructions. Those may require that all bits
30291 // are set properly.
30292 for (SDNode *U : Cond->uses()) {
30293 // TODO: Add other opcodes eventually lowered into BLEND.
30294 if (U->getOpcode() != ISD::VSELECT)
30295 return SDValue();
30296 }
30298 // Update all users of the condition before committing the change, so
30299 // that the VSELECT optimizations that expect the correct vector boolean
30300 // value will not be triggered.
30301 for (SDNode *U : Cond->uses()) {
30302 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30303 U->getValueType(0), Cond, U->getOperand(1),
30304 U->getOperand(2));
30305 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30306 }
30307 DCI.CommitTargetLoweringOpt(TLO);
30308 return SDValue();
30309 }
30310 // Only Cond (rather than other nodes in the computation chain) was
30311 // changed. Change the condition just for N to keep the opportunity to
30312 // optimize all other users their own way.
30313 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30314 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30315 return SDValue();
30316 }
30317 }
30319 // Look for vselects with LHS/RHS being bitcasted from an operation that
30320 // can be executed on another type. Push the bitcast to the inputs of
30321 // the operation. This exposes opportunities for using masking instructions.
30322 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30323 CondVT.getVectorElementType() == MVT::i1) {
30324 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30325 return SDValue(N, 0);
30326 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30327 return SDValue(N, 0);
30328 }
30330 return SDValue();
30331 }
30333 /// Combine:
30334 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30335 /// to:
30336 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30337 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30338 /// Note that this is only legal for some op/cc combinations.
30339 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30340 SelectionDAG &DAG) {
30341 // This combine only operates on CMP-like nodes.
30342 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30343 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30344 return SDValue();
30346 // Can't replace the cmp if it has more uses than the one we're looking at.
30347 // FIXME: We would like to be able to handle this, but would need to make sure
30348 // all uses were updated.
30349 if (!Cmp.hasOneUse())
30350 return SDValue();
30352 // This only applies to variations of the common case:
30353 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30354 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30355 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30356 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30357 // Using the proper condcodes (see below), overflow is checked for.
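// E.g. (sketch): (setcc (cmp (atomic_load_add x, 1), 0), COND_S) becomes
// (setcc (LADD x, 1), COND_LE): the sign test of x is re-expressed as a
// signed-less-or-equal test of x + 1 using the LOCK'ed add's own flags.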
30359 // FIXME: We can generalize both constraints:
30360 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30361 // - LHS != 1
30362 // if the result is compared.
30364 SDValue CmpLHS = Cmp.getOperand(0);
30365 SDValue CmpRHS = Cmp.getOperand(1);
30367 if (!CmpLHS.hasOneUse())
30368 return SDValue();
30370 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30371 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30372 return SDValue();
30374 const unsigned Opc = CmpLHS.getOpcode();
30376 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30377 return SDValue();
30379 SDValue OpRHS = CmpLHS.getOperand(2);
30380 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30381 if (!OpRHSC)
30382 return SDValue();
30384 APInt Addend = OpRHSC->getAPIntValue();
30385 if (Opc == ISD::ATOMIC_LOAD_SUB)
30386 Addend = -Addend;
30388 if (CC == X86::COND_S && Addend == 1)
30389 CC = X86::COND_LE;
30390 else if (CC == X86::COND_NS && Addend == 1)
30391 CC = X86::COND_G;
30392 else if (CC == X86::COND_G && Addend == -1)
30393 CC = X86::COND_GE;
30394 else if (CC == X86::COND_LE && Addend == -1)
30395 CC = X86::COND_L;
30396 else
30397 return SDValue();
30399 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30400 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30401 DAG.getUNDEF(CmpLHS.getValueType()));
30402 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30403 return LockOp;
30404 }
30406 // Check whether a boolean test is testing a boolean value generated by
30407 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
30408 // flag.
30410 // Simplify the following patterns:
30411 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30412 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30413 // to (Op EFLAGS Cond)
30415 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30416 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30417 // to (Op EFLAGS !Cond)
30419 // where Op could be BRCOND or CMOV.
30421 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30422 // This combine only operates on CMP-like nodes.
30423 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30424 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30425 return SDValue();
30427 // Quit if not used as a boolean value.
30428 if (CC != X86::COND_E && CC != X86::COND_NE)
30429 return SDValue();
30431 // Check CMP operands. One of them should be 0 or 1 and the other should be
30432 // an SetCC or extended from it.
30433 SDValue Op1 = Cmp.getOperand(0);
30434 SDValue Op2 = Cmp.getOperand(1);
30436 SDValue SetCC;
30437 const ConstantSDNode* C = nullptr;
30438 bool needOppositeCond = (CC == X86::COND_E);
30439 bool checkAgainstTrue = false; // Is it a comparison against 1?
30441 if ((C = dyn_cast<ConstantSDNode>(Op1)))
30442 SetCC = Op2;
30443 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30444 SetCC = Op1;
30445 else // Quit if all operands are not constants.
30446 return SDValue();
30448 if (C->getZExtValue() == 1) {
30449 needOppositeCond = !needOppositeCond;
30450 checkAgainstTrue = true;
30451 } else if (C->getZExtValue() != 0)
30452 // Quit if the constant is neither 0 nor 1.
30453 return SDValue();
30455 bool truncatedToBoolWithAnd = false;
30456 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30457 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30458 SetCC.getOpcode() == ISD::TRUNCATE ||
30459 SetCC.getOpcode() == ISD::AND) {
30460 if (SetCC.getOpcode() == ISD::AND) {
30461 int OpIdx = -1;
30462 if (isOneConstant(SetCC.getOperand(0)))
30463 OpIdx = 1;
30464 if (isOneConstant(SetCC.getOperand(1)))
30465 OpIdx = 0;
30466 if (OpIdx < 0)
30467 break;
30468 SetCC = SetCC.getOperand(OpIdx);
30469 truncatedToBoolWithAnd = true;
30470 } else
30471 SetCC = SetCC.getOperand(0);
30472 }
30474 switch (SetCC.getOpcode()) {
30475 case X86ISD::SETCC_CARRY:
30476 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30477 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30478 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30479 // truncated to i1 using 'and'.
30480 if (checkAgainstTrue && !truncatedToBoolWithAnd)
30481 break;
30482 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30483 "Invalid use of SETCC_CARRY!");
30484 LLVM_FALLTHROUGH;
30485 case X86ISD::SETCC:
30486 // Set the condition code or opposite one if necessary.
30487 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30488 if (needOppositeCond)
30489 CC = X86::GetOppositeBranchCondition(CC);
30490 return SetCC.getOperand(1);
30491 case X86ISD::CMOV: {
30492 // Check whether false/true value has canonical one, i.e. 0 or 1.
30493 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30494 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30495 // Quit if true value is not a constant.
30496 if (!TVal)
30497 return SDValue();
30498 // Quit if false value is not a constant.
30499 if (!FVal) {
30500 SDValue Op = SetCC.getOperand(0);
30501 // Skip 'zext' or 'trunc' node.
30502 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30503 Op.getOpcode() == ISD::TRUNCATE)
30504 Op = Op.getOperand(0);
30505 // A special case for rdrand/rdseed, where 0 is set if false cond is
30506 // found.
30507 if ((Op.getOpcode() != X86ISD::RDRAND &&
30508 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30509 return SDValue();
30510 }
30511 // Quit if false value is not the constant 0 or 1.
30512 bool FValIsFalse = true;
30513 if (FVal && FVal->getZExtValue() != 0) {
30514 if (FVal->getZExtValue() != 1)
30515 return SDValue();
30516 // If FVal is 1, opposite cond is needed.
30517 needOppositeCond = !needOppositeCond;
30518 FValIsFalse = false;
30519 }
30520 // Quit if TVal is not the constant opposite of FVal.
30521 if (FValIsFalse && TVal->getZExtValue() != 1)
30522 return SDValue();
30523 if (!FValIsFalse && TVal->getZExtValue() != 0)
30524 return SDValue();
30525 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30526 if (needOppositeCond)
30527 CC = X86::GetOppositeBranchCondition(CC);
30528 return SetCC.getOperand(3);
30529 }
30530 }
30532 return SDValue();
30533 }
30535 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30536 /// Match:
30537 /// (X86or (X86setcc) (X86setcc))
30538 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30539 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30540 X86::CondCode &CC1, SDValue &Flags,
30541 bool &isAnd) {
30542 if (Cond->getOpcode() == X86ISD::CMP) {
30543 if (!isNullConstant(Cond->getOperand(1)))
30544 return false;
30546 Cond = Cond->getOperand(0);
30547 }
30549 isAnd = false;
30551 SDValue SetCC0, SetCC1;
30552 switch (Cond->getOpcode()) {
30553 default: return false;
30554 case ISD::AND:
30555 case X86ISD::AND:
30556 isAnd = true;
30557 LLVM_FALLTHROUGH;
30558 case ISD::OR:
30559 case X86ISD::OR:
30560 SetCC0 = Cond->getOperand(0);
30561 SetCC1 = Cond->getOperand(1);
30562 break;
30563 }
30565 // Make sure we have SETCC nodes, using the same flags value.
30566 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30567 SetCC1.getOpcode() != X86ISD::SETCC ||
30568 SetCC0->getOperand(1) != SetCC1->getOperand(1))
30569 return false;
30571 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30572 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30573 Flags = SetCC0->getOperand(1);
30574 return true;
30575 }
30577 /// Optimize an EFLAGS definition used according to the condition code \p CC
30578 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30579 /// uses of chain values.
30580 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30581 SelectionDAG &DAG) {
30582 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30583 return R;
30584 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30585 }
30587 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30588 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30589 TargetLowering::DAGCombinerInfo &DCI,
30590 const X86Subtarget &Subtarget) {
30591 SDLoc DL(N);
30593 // If the flag operand isn't dead, don't touch this CMOV.
30594 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30595 return SDValue();
30597 SDValue FalseOp = N->getOperand(0);
30598 SDValue TrueOp = N->getOperand(1);
30599 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30600 SDValue Cond = N->getOperand(3);
30602 if (CC == X86::COND_E || CC == X86::COND_NE) {
30603 switch (Cond.getOpcode()) {
30604 default: break;
30605 case X86ISD::BSR:
30606 case X86ISD::BSF:
30607 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30608 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30609 return (CC == X86::COND_E) ? FalseOp : TrueOp;
30610 }
30611 }
30613 // Try to simplify the EFLAGS and condition code operands.
30614 // We can't always do this as FCMOV only supports a subset of X86 cond.
30615 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30616 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30617 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30618 Flags};
30619 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30620 }
30621 }
30623 // If this is a select between two integer constants, try to do some
30624 // optimizations. Note that the operands are ordered the opposite of SELECT
30625 // operands.
30626 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30627 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30628 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30629 // larger than FalseC (the false value).
30630 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30631 CC = X86::GetOppositeBranchCondition(CC);
30632 std::swap(TrueC, FalseC);
30633 std::swap(TrueOp, FalseOp);
30634 }
30636 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
30637 // This is efficient for any integer data type (including i8/i16) and
30638 // shift amount.
30639 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30640 Cond = getSETCC(CC, Cond, DL, DAG);
30642 // Zero extend the condition if needed.
30643 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30645 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30646 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30647 DAG.getConstant(ShAmt, DL, MVT::i8));
30648 if (N->getNumValues() == 2) // Dead flag value?
30649 return DCI.CombineTo(N, Cond, SDValue());
30650 return Cond;
30651 }
30653 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
30654 // for any integer data type, including i8/i16.
30655 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30656 Cond = getSETCC(CC, Cond, DL, DAG);
30658 // Zero extend the condition if needed.
30659 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30660 FalseC->getValueType(0), Cond);
30661 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30662 SDValue(FalseC, 0));
30664 if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }
30669 // Optimize cases that will turn into an LEA instruction. This requires
30670 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30671 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30672 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
30673 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
30675 bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
30679 case 1: // result = add base, cond
30680 case 2: // result = lea base( , cond*2)
30681 case 3: // result = lea base(cond, cond*2)
30682 case 4: // result = lea base( , cond*4)
30683 case 5: // result = lea base(cond, cond*4)
30684 case 8: // result = lea base( , cond*8)
30685 case 9: // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }
30691 if (isFastMultiplier) {
30692 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = getSETCC(CC, Cond, DL, DAG);
30694 // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
30697 // Scale the condition by the difference.
30699 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30700 DAG.getConstant(Diff, DL, Cond.getValueType()));
30702 // Add the base if non-zero.
30703 if (FalseC->getAPIntValue() != 0)
30704 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30705 SDValue(FalseC, 0));
30706 if (N->getNumValues() == 2) // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
30714 // Handle these cases:
30715 // (select (x != c), e, c) -> select (x != c), e, x),
30716 // (select (x == c), c, e) -> select (x == c), x, e)
30717 // where the c is an integer constant, and the "select" is the combination
30718 // of CMOV and CMP.
30720 // The rationale for this change is that the conditional-move from a constant
30721 // needs two instructions, however, conditional-move from a register needs
30722 // only one instruction.
30724 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
30725 // some instruction-combining opportunities. This opt needs to be
30726 // postponed as late as possible.
30728 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
30730 // late as possible.
30732 ConstantSDNode *CmpAgainst = nullptr;
30733 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
30734 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
30735 !isa<ConstantSDNode>(Cond.getOperand(0))) {
30737 if (CC == X86::COND_NE &&
30738 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
30739 CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueOp, FalseOp);
      }
30743 if (CC == X86::COND_E &&
30744 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
30745 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
30746 DAG.getConstant(CC, DL, MVT::i8), Cond };
        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      }
    }
  }
30752 // Fold and/or of setcc's to double CMOV:
30753 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
30754 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
30756 // This combine lets us generate:
30757 // cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same as above)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
30766 // When we can use CMOV, or when there is no mispredict, this improves
30767 // throughput and reduces register pressure.
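  // Illustrative sketch (not from the original source): for
  // 'x = (cc1 | cc2) ? T : F' this emits
  //   cmovcc1 T, F
  //   cmovcc2 T, tmp
  // on the same flags instead of two setccs plus an OR and a test.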
30769 if (CC == X86::COND_NE) {
    SDValue Flags;
    X86::CondCode CC0, CC1;
    bool isAndSetCC;
30773 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
      if (isAndSetCC) {
        std::swap(FalseOp, TrueOp);
        CC0 = X86::GetOppositeBranchCondition(CC0);
        CC1 = X86::GetOppositeBranchCondition(CC1);
      }
      SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
                        Flags};
30782 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
30783 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
30784 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
      return CMOV;
    }
  }

  return SDValue();
}
30793 /// Different mul shrinking modes.
30794 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
30796 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
30797 EVT VT = N->getOperand(0).getValueType();
  if (VT.getScalarSizeInBits() != 32)
    return false;
30801 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
30802 unsigned SignBits[2] = {1, 1};
30803 bool IsPositive[2] = {false, false};
30804 for (unsigned i = 0; i < 2; i++) {
30805 SDValue Opd = N->getOperand(i);
    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
30808 // compute signbits for it separately.
30809 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
      // For anyextend, it is safe to assume an appropriate number of leading
      // sign/zero bits.
      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
        SignBits[i] = 25;
      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
               MVT::i16)
        SignBits[i] = 17;
      else
        return false;
      IsPositive[i] = true;
30820 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
30821 // All the operands of BUILD_VECTOR need to be int constant.
30822 // Find the smallest value range which all the operands belong to.
30824 IsPositive[i] = true;
30825 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
        if (SubOp.isUndef())
          continue;
        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
        if (!CN)
          return false;
30831 APInt IntVal = CN->getAPIntValue();
30832 if (IntVal.isNegative())
30833 IsPositive[i] = false;
30834 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
      }
    } else {
      SignBits[i] = DAG.ComputeNumSignBits(Opd);
30838 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
        IsPositive[i] = true;
    }
  }
30843 bool AllPositive = IsPositive[0] && IsPositive[1];
30844 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25)
    Mode = MULS8;
  // When ranges are from 0 ~ 255, use MULU8 mode.
  else if (AllPositive && MinSignBits >= 24)
    Mode = MULU8;
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  else if (MinSignBits >= 17)
    Mode = MULS16;
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  else if (AllPositive && MinSignBits >= 16)
    Mode = MULU16;
  else
    return false;
  return true;
}
30862 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
30864 /// efficient code. Two typical patterns are handled:
30866 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30867 /// %4 = sext/zext <N x i8> %3 to <N x i32>
///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30869 /// %5 = mul <N x i32> %2, %4
30872 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30873 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30874 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30875 /// %5 = mul <N x i32> %2, %4
30877 /// There are four mul shrinking modes:
30878 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30880 /// generate pmullw+sext32 for it (MULS8 mode).
30881 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30882 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30883 /// generate pmullw+zext32 for it (MULU8 mode).
30884 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30885 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30886 /// generate pmullw+pmulhw for it (MULS16 mode).
30887 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30888 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30889 /// generate pmullw+pmulhuw for it (MULU16 mode).
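/// Illustrative example (assumed lowering, not from the original source):
/// with SSE2 but without SSE4.1,
///   %5 = mul <8 x i32> (zext <8 x i16> %a), (zext <8 x i16> %b)
/// can be emitted as pmullw + pmulhuw on the i16 operands and repacked with
/// punpcklwd/punpckhwd (MULU16 mode), avoiding scalarized 32-bit multiplies.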
30890 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30891 const X86Subtarget &Subtarget) {
30892 // Check for legality
30893 // pmullw/pmulhw are not supported by SSE.
30894 if (!Subtarget.hasSSE2())
30897 // Check for profitability
30898 // pmulld is supported since SSE41. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
  // pmullw+pmulhw.
30901 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
30902 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
30906 if (!canReduceVMulWidth(N, DAG, Mode))
30910 SDValue N0 = N->getOperand(0);
30911 SDValue N1 = N->getOperand(1);
30912 EVT VT = N->getOperand(0).getValueType();
30913 unsigned RegSize = 128;
30914 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
  EVT ReducedVT =
      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30917 // Shrink the operands of mul.
30918 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30919 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30921 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30922 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30923 // lower part is needed.
30924 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30925 if (Mode == MULU8 || Mode == MULS8) {
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                         DL, VT, MulLo);
    }
30929 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30930 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30931 // the higher part is also needed.
30932 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30933 ReducedVT, NewN0, NewN1);
    // Repack the lower part and higher part result of mul into a wider
    // result.
30937 // Generate shuffle functioning as punpcklwd.
30938 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30939 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30940 ShuffleMask[2 * i] = i;
30941 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
    }
    SDValue ResLo =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30945 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30946 // Generate shuffle functioning as punpckhwd.
30947 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30948 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30949 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
    }
    SDValue ResHi =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30953 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30954 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30957 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30958 // to legalize the mul explicitly because implicit legalization for type
30959 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30960 // instructions which will not exist when we explicitly legalize it by
30961 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30962 // <4 x i16> undef).
30964 // Legalize the operands of mul.
30965 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30966 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30967 if ((RegSize % ReducedSizeInBits) != 0)
30970 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30971 DAG.getUNDEF(ReducedVT));
  Ops[0] = NewN0;
  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
  Ops[0] = NewN1;
  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30977 if (Mode == MULU8 || Mode == MULS8) {
    // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
    // part is needed.
30980 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
    // Convert the type of mul result to VT.
30983 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30984 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                            : ISD::SIGN_EXTEND_VECTOR_INREG,
                              DL, ResVT, Mul);
30987 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30988 DAG.getIntPtrConstant(0, DL));
30990 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
30991 // MULU16/MULS16, both parts are needed.
30992 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30993 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30994 OpsVT, NewN0, NewN1);
30996 // Repack the lower part and higher part result of mul into a wider
30997 // result. Make sure the type of mul result is VT.
30998 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30999 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31000 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31001 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31002 DAG.getIntPtrConstant(0, DL));
31007 /// Optimize a single multiply with constant into two operations in order to
31008 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
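/// E.g. (illustrative, assumed codegen): 'x * 40' becomes x*5 via LEA, then
/// a shift by 3:
///   leaq (%rdi,%rdi,4), %rax
///   shlq $3, %rax
/// instead of a single, higher-latency imul.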
31009 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31010 TargetLowering::DAGCombinerInfo &DCI,
31011 const X86Subtarget &Subtarget) {
31012 EVT VT = N->getValueType(0);
31013 if (DCI.isBeforeLegalize() && VT.isVector())
31014 return reduceVMULWidth(N, DAG, Subtarget);
31016 // An imul is usually smaller than the alternative sequence.
31017 if (DAG.getMachineFunction().getFunction()->optForMinSize())
31020 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
31023 if (VT != MVT::i64 && VT != MVT::i32)
31026 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
31029 uint64_t MulAmt = C->getZExtValue();
31030 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
31033 uint64_t MulAmt1 = 0;
31034 uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
31051 if (isPowerOf2_64(MulAmt2) &&
31052 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
31054 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
31056 std::swap(MulAmt1, MulAmt2);
31058 if (isPowerOf2_64(MulAmt1))
31059 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31060 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31062 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31063 DAG.getConstant(MulAmt1, DL, VT));
31065 if (isPowerOf2_64(MulAmt2))
31066 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31067 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31069 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31070 DAG.getConstant(MulAmt2, DL, VT));
  }

  if (!NewMul) {
    assert(MulAmt != 0 &&
31075 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31076 "Both cases that could cause potential overflows should have "
31077 "already been handled.");
31078 int64_t SignMulAmt = C->getSExtValue();
31079 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31080 (SignMulAmt != -INT64_MAX)) {
31081 int NumSign = SignMulAmt > 0 ? 1 : -1;
31082 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31083 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31084 if (IsPowerOf2_64PlusOne) {
31085 // (mul x, 2^N + 1) => (add (shl x, N), x)
31086 NewMul = DAG.getNode(
31087 ISD::ADD, DL, VT, N->getOperand(0),
31088 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
                                      MVT::i8)));
31091 } else if (IsPowerOf2_64MinusOne) {
31092 // (mul x, 2^N - 1) => (sub (shl x, N), x)
      NewMul = DAG.getNode(
          ISD::SUB, DL, VT,
          DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
                                      MVT::i8)),
          N->getOperand(0));
    }
      // To negate, subtract the number from zero.
      if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
        NewMul =
            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
    }
  }
  if (NewMul)
    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);

  return SDValue();
}
31114 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31115 SDValue N0 = N->getOperand(0);
31116 SDValue N1 = N->getOperand(1);
31117 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31118 EVT VT = N0.getValueType();
31120 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31121 // since the result of setcc_c is all zero's or all ones.
31122 if (VT.isInteger() && !VT.isVector() &&
31123 N1C && N0.getOpcode() == ISD::AND &&
31124 N0.getOperand(1).getOpcode() == ISD::Constant) {
31125 SDValue N00 = N0.getOperand(0);
31126 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31127 Mask <<= N1C->getAPIntValue();
31128 bool MaskOK = false;
31129 // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
31132 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31133 // of the underlying setcc_c operation if the setcc_c was zero extended.
31134 // Consider the following example:
31135 // zext(setcc_c) -> i32 0x0000FFFF
31136 // c1 -> i32 0x0000FFFF
31137 // c2 -> i32 0x00000001
31138 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31139 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
31142 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
31145 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31146 N00.getOpcode() == ISD::ANY_EXTEND) &&
31147 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    }
31150 if (MaskOK && Mask != 0) {
      SDLoc DL(N);
      return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
    }
  }
31156 // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // SHL.
31159 // (shl V, 1) -> add V,V
31160 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31161 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31162 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31163 // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an ADD
      // of two values.
31166 if (N1SplatC->getAPIntValue() == 1)
31167 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31173 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31174 SDValue N0 = N->getOperand(0);
31175 SDValue N1 = N->getOperand(1);
31176 EVT VT = N0.getValueType();
31177 unsigned Size = VT.getSizeInBits();
31179 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31180 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31181 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31182 // depending on sign of (SarConst - [56,48,32,24,16])
31184 // sexts in X86 are MOVs. The MOVs have the same code size
31185 // as above SHIFTs (only SHIFT on 1 has lower code size).
31186 // However the MOVs have 2 advantages to a SHIFT:
31187 // 1. MOVs can write to a register that differs from source
31188 // 2. MOVs accept memory operands
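  // E.g. (illustrative, assumed codegen): (sar (shl x, 56), 58) on i64 can
  // become
  //   movsbq %dil, %rax    ; sign_extend_inreg from i8
  //   sarq   $2, %rax      ; remaining shift amount 58 - 56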
31190 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31191 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31192 N0.getOperand(1).getOpcode() != ISD::Constant)
31195 SDValue N00 = N0.getOperand(0);
31196 SDValue N01 = N0.getOperand(1);
31197 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31198 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31199 EVT CVT = N1.getValueType();
  if (SarConst.isNegative())
    return SDValue();
31204 for (MVT SVT : MVT::integer_valuetypes()) {
31205 unsigned ShiftSize = SVT.getSizeInBits();
    // Skip types without a corresponding sext/zext and ShlConst values that
    // are not one of [56,48,32,24,16].
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
31217 return DAG.getNode(ISD::SHL, DL, VT, NN,
31218 DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }

  return SDValue();
}
31226 /// \brief Returns a vector of 0s if the node in input is a vector logical
31227 /// shift by a constant amount which is known to be bigger than or equal
31228 /// to the vector element size in bits.
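/// E.g. (illustrative): (srl <4 x i32> %x, <32, 32, 32, 32>) folds to a zero
/// vector, because SSE2/AVX2 psrld with an immediate shift amount >= 32
/// always produces zeroes.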
31229 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31230 const X86Subtarget &Subtarget) {
31231 EVT VT = N->getValueType(0);
31233 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31234 (!Subtarget.hasInt256() ||
      (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();
  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
31240 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31241 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31242 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31243 unsigned MaxAmount =
31244 VT.getSimpleVT().getScalarSizeInBits();
31246 // SSE2/AVX2 logical shifts always return a vector of 0s
31247 // if the shift amount is bigger than or equal to
31248 // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
31250 if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
    }

  return SDValue();
}
31257 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31258 TargetLowering::DAGCombinerInfo &DCI,
31259 const X86Subtarget &Subtarget) {
31260 if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;
31264 if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightAlgebraic(N, DAG))
      return V;
31268 // Try to fold this logical shift into a zero vector.
31269 if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
      return V;

  return SDValue();
}
31276 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31277 TargetLowering::DAGCombinerInfo &DCI,
31278 const X86Subtarget &Subtarget) {
31279 unsigned Opcode = N->getOpcode();
31280 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31281 X86ISD::VSRLI == Opcode) &&
31282 "Unexpected shift opcode");
31283 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31284 EVT VT = N->getValueType(0);
31285 SDValue N0 = N->getOperand(0);
31286 SDValue N1 = N->getOperand(1);
31287 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31288 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31289 "Unexpected value type");
31291 // Out of range logical bit shifts are guaranteed to be zero.
31292 // Out of range arithmetic bit shifts splat the sign bit.
31293 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31294 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    else
      ShiftVal = NumBitsPerElt - 1;
  }
31301 // Shift N0 by zero -> N0.
31305 // Shift zero -> zero.
31306 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31307 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31309 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31310 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31311 // TODO - support other sra opcodes as needed.
31312 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31313 N0.getOpcode() == X86ISD::VSRAI)
31314 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31316 // We can decode 'whole byte' logical bit shifts as shuffles.
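  // E.g. (illustrative): a VSRLI of each v2i64 lane by 16 bits moves whole
  // bytes, so it behaves like a byte shuffle that pulls two zero bytes into
  // the top of each lane and can be handled by the shuffle combiner below.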
31317 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
31320 NonceMask.push_back(0);
31321 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }
31327 // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
31330 if (N->isOnlyUserOf(N0.getNode()) &&
31331 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31332 assert(EltBits.size() == VT.getVectorNumElements() &&
31333 "Unexpected shift value type");
31334 unsigned ShiftImm = ShiftVal.getZExtValue();
31335 for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
31343 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31349 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31350 TargetLowering::DAGCombinerInfo &DCI,
31351 const X86Subtarget &Subtarget) {
  assert(((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31354 (N->getOpcode() == X86ISD::PINSRW &&
31355 N->getValueType(0) == MVT::v8i16)) &&
31356 "Unexpected vector insertion");
31358 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
  SmallVector<int, 1> NonceMask; // Just a placeholder.
31361 NonceMask.push_back(0);
31362 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                DCI, Subtarget);
  return SDValue();
}
31368 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31369 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31370 /// OR -> CMPNEQSS.
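/// E.g. (illustrative): 'and (seteq (cmp %a, %b)), (setnp (cmp %a, %b))' on
/// f32 operands can become a single 'cmpeqss', which produces an all-ones or
/// all-zeroes mask, followed by masking out bit 0.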
31371 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31372 TargetLowering::DAGCombinerInfo &DCI,
31373 const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31377 // we're requiring SSE2 for both.
31378 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31379 SDValue N0 = N->getOperand(0);
31380 SDValue N1 = N->getOperand(1);
31381 SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);
31385 // The SETCCs should both refer to the same CMP.
31386 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31389 SDValue CMP00 = CMP0->getOperand(0);
31390 SDValue CMP01 = CMP0->getOperand(1);
31391 EVT VT = CMP00.getValueType();
31393 if (VT == MVT::f32 || VT == MVT::f64) {
31394 bool ExpectingFlags = false;
31395 // Check for any users that want flags:
31396 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31397 !ExpectingFlags && UI != UE; ++UI)
31398 switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
31405 case ISD::CopyToReg:
31406 case ISD::SIGN_EXTEND:
31407 case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }
31412 if (!ExpectingFlags) {
31413 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31414 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31416 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }
31422 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31423 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31424 // FIXME: need symbolic constants for these magic numbers.
31425 // See X86ATTInstPrinter.cpp:printSSECC().
31426 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31427 if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31430 DAG.getConstant(x86cc, DL, MVT::i8));
31431 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31432 FSetCC, DAG.getIntPtrConstant(0, DL));
31434 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31435 CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));
31439 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31440 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31442 if (is64BitFP && !Subtarget.is64Bit()) {
31443 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31444 // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
31446 // bits, but can do this little dance to extract the lowest 32 bits
31447 // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
31450 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31451 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31452 Vector32, DAG.getIntPtrConstant(0, DL));
31456 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31457 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31458 DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
31469 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31470 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31471 assert(N->getOpcode() == ISD::AND);
31473 EVT VT = N->getValueType(0);
31474 SDValue N0 = N->getOperand(0);
31475 SDValue N1 = N->getOperand(1);
31478 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31481 if (N0.getOpcode() == ISD::XOR &&
31482 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31483 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31485 if (N1.getOpcode() == ISD::XOR &&
31486 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31487 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31492 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31493 // register. In most cases we actually compare or select YMM-sized registers
31494 // and mixing the two types creates horrible code. This method optimizes
31495 // some of the transition sequences.
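// E.g. (sketch, not from the original source):
//   sext(v8i16 (and (trunc v8i32 %a), (trunc v8i32 %b))) to v8i32
// can instead AND the wide %a and %b directly and sign-extend in-register,
// staying in YMM-sized operations throughout.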
31496 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31497 TargetLowering::DAGCombinerInfo &DCI,
31498 const X86Subtarget &Subtarget) {
31499 EVT VT = N->getValueType(0);
31500 if (!VT.is256BitVector())
31503 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31504 N->getOpcode() == ISD::ZERO_EXTEND ||
31505 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31507 SDValue Narrow = N->getOperand(0);
31508 EVT NarrowVT = Narrow->getValueType(0);
31509 if (!NarrowVT.is128BitVector())
31512 if (Narrow->getOpcode() != ISD::XOR &&
31513 Narrow->getOpcode() != ISD::AND &&
31514 Narrow->getOpcode() != ISD::OR)
31517 SDValue N0 = Narrow->getOperand(0);
31518 SDValue N1 = Narrow->getOperand(1);
31521 // The Left side has to be a trunc.
31522 if (N0.getOpcode() != ISD::TRUNCATE)
31525 // The type of the truncated inputs.
31526 EVT WideVT = N0->getOperand(0)->getValueType(0);
31530 // The right side has to be a 'trunc' or a constant vector.
31531 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31532 ConstantSDNode *RHSConstSplat = nullptr;
31533 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31534 RHSConstSplat = RHSBV->getConstantSplatNode();
31535 if (!RHSTrunc && !RHSConstSplat)
31538 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31540 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31543 // Set N0 and N1 to hold the inputs to the new wide operation.
31544 N0 = N0->getOperand(0);
31545 if (RHSConstSplat) {
31546 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31547 SDValue(RHSConstSplat, 0));
31548 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31549 } else if (RHSTrunc) {
31550 N1 = N1->getOperand(0);
31553 // Generate the wide operation.
31554 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31555 unsigned Opcode = N->getOpcode();
31557 case ISD::ANY_EXTEND:
31559 case ISD::ZERO_EXTEND: {
31560 unsigned InBits = NarrowVT.getScalarSizeInBits();
31561 APInt Mask = APInt::getAllOnesValue(InBits);
31562 Mask = Mask.zext(VT.getScalarSizeInBits());
31563 return DAG.getNode(ISD::AND, DL, VT,
31564 Op, DAG.getConstant(Mask, DL, VT));
31566 case ISD::SIGN_EXTEND:
31567 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31568 Op, DAG.getValueType(NarrowVT));
31570 llvm_unreachable("Unexpected opcode");
31574 /// If both input operands of a logic op are being cast from floating point
31575 /// types, try to convert this into a floating point logic node to avoid
31576 /// unnecessary moves from SSE to integer registers.
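/// E.g. (illustrative): 'xor (bitcast f32 %a to i32), (bitcast f32 %b to i32)'
/// becomes 'bitcast (FXOR %a, %b) to i32', keeping both values in SSE
/// registers instead of bouncing them through GPRs.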
31577 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31578 const X86Subtarget &Subtarget) {
31579 unsigned FPOpcode = ISD::DELETED_NODE;
31580 if (N->getOpcode() == ISD::AND)
31581 FPOpcode = X86ISD::FAND;
31582 else if (N->getOpcode() == ISD::OR)
31583 FPOpcode = X86ISD::FOR;
31584 else if (N->getOpcode() == ISD::XOR)
31585 FPOpcode = X86ISD::FXOR;
31587 assert(FPOpcode != ISD::DELETED_NODE &&
31588 "Unexpected input node for FP logic conversion");
31590 EVT VT = N->getValueType(0);
31591 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
31594 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
31595 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
31596 (Subtarget.hasSSE2() && VT == MVT::i64))) {
31597 SDValue N00 = N0.getOperand(0);
31598 SDValue N10 = N1.getOperand(0);
31599 EVT N00Type = N00.getValueType();
31600 EVT N10Type = N10.getValueType();
31601 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
31602 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
31603 return DAG.getBitcast(VT, FPLogic);
31609 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
31610 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
31611 /// with a shift-right to eliminate loading the vector constant mask value.
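/// E.g. (sketch): 'and <4 x i32> %cmp, <1, 1, 1, 1>', where every lane of
/// %cmp is known to be all-ones or zero, can become 'psrld $31, %cmp',
/// avoiding a constant-pool load for the mask.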
31612 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
31613 const X86Subtarget &Subtarget) {
31614 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
31615 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
31616 EVT VT0 = Op0.getValueType();
31617 EVT VT1 = Op1.getValueType();
  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
    return SDValue();

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())
    return SDValue();
31627 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
31630 unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
31635 unsigned ShiftVal = SplatVal.countTrailingOnes();
31636 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
31637 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
31638 return DAG.getBitcast(N->getValueType(0), Shift);
31641 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
31642 TargetLowering::DAGCombinerInfo &DCI,
31643 const X86Subtarget &Subtarget) {
31644 if (DCI.isBeforeLegalizeOps())
31647 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31650 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31653 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
31656 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
31659 EVT VT = N->getValueType(0);
31660 SDValue N0 = N->getOperand(0);
31661 SDValue N1 = N->getOperand(1);
31664 // Attempt to recursively combine a bitmask AND with shuffles.
31665 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
    SmallVector<int, 1> NonceMask; // Just a placeholder.
31668 NonceMask.push_back(0);
31669 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }
31675 // Create BEXTR instructions
31676 // BEXTR is ((X >> imm) & (2**size-1))
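  // E.g. (illustrative): '(x >> 4) & 0xFFF' with BMI can become a single
  // 'bextr' whose control value 0x0C04 encodes length 12 (bits 15:8) and
  // start 4 (bits 7:0), matching the 'Shift | (MaskSize << 8)' below.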
31677 if (VT != MVT::i32 && VT != MVT::i64)
31680 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
31682 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
31685 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
31686 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
31687 if (MaskNode && ShiftNode) {
31688 uint64_t Mask = MaskNode->getZExtValue();
31689 uint64_t Shift = ShiftNode->getZExtValue();
31690 if (isMask_64(Mask)) {
31691 uint64_t MaskSize = countPopulation(Mask);
31692 if (Shift + MaskSize <= VT.getSizeInBits())
31693 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                         DAG.getConstant(Shift | (MaskSize << 8), DL,
                                         MVT::i8));
    }
  }
  return SDValue();
}
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
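// E.g. (sketch): with an all-ones/all-zeroes lane mask m, the and/pandn/or
// trio picks the AND operand in lanes where m is set and the PANDN operand
// elsewhere, so on SSE4.1 it can be emitted as one pblendvb with m as the
// selector.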
31709 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
31710 const X86Subtarget &Subtarget) {
31711 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
31713 SDValue N0 = N->getOperand(0);
31714 SDValue N1 = N->getOperand(1);
31715 EVT VT = N->getValueType(0);
31717 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
31718 (VT.is256BitVector() && Subtarget.hasInt256())))
31721 // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);
31725 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
31726 // ANDNP combine allows other combines to happen that prevent matching.
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return SDValue();
31730 SDValue Mask = N1.getOperand(0);
  SDValue X = N1.getOperand(1);
  SDValue Y;
31733 if (N0.getOperand(0) == Mask)
31734 Y = N0.getOperand(1);
31735 if (N0.getOperand(1) == Mask)
31736 Y = N0.getOperand(0);
  // Check to see if the mask appeared in both the AND and ANDNP.
  if (!Y.getNode())
    return SDValue();
31742 // Validate that X, Y, and Mask are bitcasts, and see through them.
31743 Mask = peekThroughBitcasts(Mask);
31744 X = peekThroughBitcasts(X);
31745 Y = peekThroughBitcasts(Y);
31747 EVT MaskVT = Mask.getValueType();
31748 unsigned EltBits = MaskVT.getScalarSizeInBits();
31750 // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  SDLoc DL(N);

  // Try to match:
  //   (or (and (M, (sub 0, X)), (pandn M, X)))
  // which is a special case of vselect:
  //   (vselect M, (sub 0, X), X)
  // Per:
  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
31762 // We know that, if fNegate is 0 or 1:
31763 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
31765 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
31766 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
31767 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
31768 // This lets us transform our vselect to:
31769 // (add (xor X, M), (and M, 1))
31771 // (sub (xor X, M), M)
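  // Quick numeric check (illustrative): X = 5 with M = -1 (negate) gives
  // (5 ^ -1) - (-1) = -6 + 1 = -5; with M = 0 it gives (5 ^ 0) - 0 = 5.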
31772 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
31773 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
31774 auto IsNegV = [](SDNode *N, SDValue V) {
31775 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };

    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;

    if (V) {
31785 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
31786 SDValue SubOp2 = Mask;
31788 // If the negate was on the false side of the select, then
31789 // the operands of the SUB need to be swapped. PR 27251.
31790 // This is because the pattern being matched above is
31791 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
31792 // but if the pattern matched was
31793 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
31794 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
31795 // pattern also needs to be a negation of the replacement pattern above.
31796 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
31797 // sub accomplishes the negation of the replacement pattern.
      if (V == Y)
        std::swap(SubOp1, SubOp2);
31801 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
31802 return DAG.getBitcast(VT, Res);
31806 // PBLENDVB is only available on SSE 4.1.
31807 if (!Subtarget.hasSSE41())
31810 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
31812 X = DAG.getBitcast(BlendVT, X);
31813 Y = DAG.getBitcast(BlendVT, Y);
31814 Mask = DAG.getBitcast(BlendVT, Mask);
31815 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
31816 return DAG.getBitcast(VT, Mask);
31819 // Helper function for combineOrCmpEqZeroToCtlzSrl
// Transform:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
31824 // Input pattern is checked by caller.
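// E.g. (illustrative): for i32 %x, 'seteq (cmp %x, 0)' becomes 'lzcnt %x'
// followed by 'shr $5': lzcnt yields 32 only when %x == 0, so bit 5 of the
// count is exactly the zero test.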
31825 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
31826 SelectionDAG &DAG) {
31827 SDValue Cmp = Op.getOperand(1);
31828 EVT VT = Cmp.getOperand(0).getValueType();
31829 unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
31832 // The result of the shift is true or false, and on X86, the 32-bit
31833 // encoding of shr and lzcnt is more desirable.
31834 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
31835 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
31836 DAG.getConstant(Log2b, dl, VT));
31837 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
31840 // Try to transform:
31841 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
31843 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
31844 // Will also attempt to match more generic cases, eg:
31845 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
31846 // Only applies if the target supports the FastLZCNT feature.
31847 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
31848 TargetLowering::DAGCombinerInfo &DCI,
31849 const X86Subtarget &Subtarget) {
31850 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
31853 auto isORCandidate = [](SDValue N) {
31854 return (N->getOpcode() == ISD::OR && N->hasOneUse());
31857 // Check the zero extend is extending to 32-bit or more. The code generated by
31858 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
31859 // instructions to clear the upper bits.
31860 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
31861 !isORCandidate(N->getOperand(0)))
31864 // Check the node matches: setcc(eq, cmp 0)
31865 auto isSetCCCandidate = [](SDValue N) {
31866 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
31867 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
31868 N->getOperand(1).getOpcode() == X86ISD::CMP &&
31869 isNullConstant(N->getOperand(1).getOperand(1)) &&
31870 N->getOperand(1).getValueType().bitsGE(MVT::i32);
31873 SDNode *OR = N->getOperand(0).getNode();
31874 SDValue LHS = OR->getOperand(0);
31875 SDValue RHS = OR->getOperand(1);
31877 // Save nodes matching or(or, setcc(eq, cmp 0)).
31878 SmallVector<SDNode *, 2> ORNodes;
31879 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
31880 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
31881 ORNodes.push_back(OR);
31882 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
31883 LHS = OR->getOperand(0);
31884 RHS = OR->getOperand(1);
31887 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
31888 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
31889 !isORCandidate(SDValue(OR, 0)))
31892 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
31894 // or(srl(ctlz),srl(ctlz)).
31895 // The dag combiner can then fold it into:
31896 // srl(or(ctlz, ctlz)).
31897 EVT VT = OR->getValueType(0);
31898 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31899 SDValue Ret, NewRHS;
31900 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

  if (!Ret)
    return SDValue();
31906 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31907 while (ORNodes.size() > 0) {
31908 OR = ORNodes.pop_back_val();
31909 LHS = OR->getOperand(0);
31910 RHS = OR->getOperand(1);
31911 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31912 if (RHS->getOpcode() == ISD::OR)
31913 std::swap(LHS, RHS);
31914 EVT VT = OR->getValueType(0);
31915 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
    if (!NewRHS)
      return SDValue();
    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
  }

  if (Ret)
    Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

  return Ret;
}
31927 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31928 TargetLowering::DAGCombinerInfo &DCI,
31929 const X86Subtarget &Subtarget) {
31930 if (DCI.isBeforeLegalizeOps())
31933 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31936 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31939 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
31942 SDValue N0 = N->getOperand(0);
31943 SDValue N1 = N->getOperand(1);
31944 EVT VT = N->getValueType(0);
31946 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
31949 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31950 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31952 // SHLD/SHRD instructions have lower register pressure, but on some
31953 // platforms they have higher latency than the equivalent
31954 // series of shifts/or that would otherwise be generated.
31955 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31956 // have higher latencies and we are not optimizing for size.
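  // E.g. (illustrative): on i32, '(x << 10) | (y >> 22)' can fold to
  // 'shld $10, %y, %x' when the subtarget's SHLD is not slow.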
31957 if (!OptForSize && Subtarget.isSHLDSlow())
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
31962 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
31964 if (!N0.hasOneUse() || !N1.hasOneUse())
31967 SDValue ShAmt0 = N0.getOperand(1);
31968 if (ShAmt0.getValueType() != MVT::i8)
31970 SDValue ShAmt1 = N1.getOperand(1);
31971 if (ShAmt1.getValueType() != MVT::i8)
31973 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31974 ShAmt0 = ShAmt0.getOperand(0);
31975 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31976 ShAmt1 = ShAmt1.getOperand(0);
31979 unsigned Opc = X86ISD::SHLD;
31980 SDValue Op0 = N0.getOperand(0);
31981 SDValue Op1 = N1.getOperand(0);
31982 if (ShAmt0.getOpcode() == ISD::SUB ||
31983 ShAmt0.getOpcode() == ISD::XOR) {
31984 Opc = X86ISD::SHRD;
31985 std::swap(Op0, Op1);
31986 std::swap(ShAmt0, ShAmt1);
31989 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31990 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31991 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31992 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
31993 unsigned Bits = VT.getSizeInBits();
31994 if (ShAmt1.getOpcode() == ISD::SUB) {
31995 SDValue Sum = ShAmt1.getOperand(0);
31996 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31997 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31998 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31999 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32000 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
32006 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32007 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32008 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32009 return DAG.getNode(Opc, DL, VT,
32010 N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
32013 } else if (ShAmt1.getOpcode() == ISD::XOR) {
32014 SDValue Mask = ShAmt1.getOperand(1);
32015 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32016 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32017 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32018 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32019 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32020 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32021 if (Op1.getOpcode() == InnerShift &&
32022 isa<ConstantSDNode>(Op1.getOperand(1)) &&
32023 Op1.getConstantOperandVal(1) == 1) {
32024 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32025 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32027 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32028 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32029 Op1.getOperand(0) == Op1.getOperand(1)) {
32030 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32031 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32040 /// Generate NEG and CMOV for integer abs.
32041 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32042 EVT VT = N->getValueType(0);
32044 // Since X86 does not have CMOV for 8-bit integer, we don't convert
32045 // 8-bit integer abs to NEG and CMOV.
32046 if (VT.isInteger() && VT.getSizeInBits() == 8)
32049 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
32053 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32054 // and change it to SUB and CMOV.
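  // E.g. (sketch): for i32 this matches the classic branchless abs,
  // 'xor (add x, (sra x, 31)), (sra x, 31)', and emits a NEG followed by a
  // CMOV on the negation's flags instead.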
32055 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32056 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32057 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32058 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32059 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32060 // Generate SUB & CMOV.
32061 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32062 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32063 SDValue Ops[] = {N0.getOperand(0), Neg,
32064 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32065 SDValue(Neg.getNode(), 1)};
      return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
    }
  }
  return SDValue();
}
32072 /// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///   SETGT(X, -1)
32076 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32077 // This is only worth doing if the output type is i8 or i1.
32078 EVT ResultType = N->getValueType(0);
32079 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32082 SDValue N0 = N->getOperand(0);
32083 SDValue N1 = N->getOperand(1);
32085 // We should be performing an xor against a truncated shift.
32086 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32089 // Make sure we are performing an xor against one.
32090 if (!isOneConstant(N1))
32093 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32094 SDValue Shift = N0.getOperand(0);
32095 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32098 // Make sure we are truncating from one of i16, i32 or i64.
32099 EVT ShiftTy = Shift.getValueType();
32100 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32103 // Make sure the shift amount extracts the sign bit.
32104 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32105 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32108 // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison, and using SETGT matches up with what TranslateX86CC expects.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
32113 EVT ShiftOpTy = ShiftOp.getValueType();
32114 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32115 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32116 *DAG.getContext(), ResultType);
32117 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32118 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32119 if (SetCCResultType != ResultType)
    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
  return Cond;
}
32124 /// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
32129 /// This should be called before type legalization because the pattern may not
32130 /// persist after that.
32131 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32132 const X86Subtarget &Subtarget) {
32133 EVT VT = N->getValueType(0);
32134 if (!VT.isSimple())
32137 switch (VT.getSimpleVT().SimpleTy) {
32138 default: return SDValue();
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32142 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
  case MVT::v32i8:
  case MVT::v16i16:
  case MVT::v8i32:
  case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
  }
32149 // There must be a shift right algebraic before the xor, and the xor must be a
32150 // 'not' operation.
32151 SDValue Shift = N->getOperand(0);
32152 SDValue Ones = N->getOperand(1);
32153 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32154 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32157 // The shift should be smearing the sign bit across each vector element.
32158 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
  if (!ShiftBV)
    return SDValue();

  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32163 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32164 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32167 // Create a greater-than comparison against -1. We don't use the more obvious
32168 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32169 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32172 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32173 /// is valid for the given \p Subtarget.
32174 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32175 const X86Subtarget &Subtarget) {
32176 if (!Subtarget.hasAVX512())
32179 // FIXME: Scalar type may be supported if we move it to vector register.
32180 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32183 EVT SrcElVT = SrcVT.getScalarType();
32184 EVT DstElVT = DstVT.getScalarType();
32185 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32187 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32189 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32190 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32194 /// Detect a pattern of truncation with saturation:
32195 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was
/// not matched.
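/// E.g. (illustrative):
///   %lo = umin <16 x i32> %x, <16 x i32> <i32 65535, ...>
///   %t  = trunc <16 x i32> %lo to <16 x i16>
/// returns %x, which the AVX512 caller can then emit as a saturating
/// truncate such as vpmovusdw.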
32198 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32199 if (In.getOpcode() != ISD::UMIN)
  // Saturation with truncation. We truncate from InVT to VT.
32203 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32204 "Unexpected types for truncate operation");
  APInt C;
  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
  }
  return SDValue();
}
32216 /// Detect a pattern of truncation with saturation:
32217 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow using the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was
/// not matched.
32221 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32222 const X86Subtarget &Subtarget) {
32223 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
  return detectUSatPattern(In, VT);
}

static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32230 const X86Subtarget &Subtarget) {
32231 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32232 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32234 if (auto USatVal = detectUSatPattern(In, VT))
32235 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32236 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32240 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32241 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
32242 /// X86ISD::AVG instruction.
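/// E.g. (sketch): for two <16 x i8> inputs, the zext/add/add-1/lshr/trunc
/// chain below collapses to a single 'pavgb', computing the rounded unsigned
/// average without ever widening to i32.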
32243 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
32246 if (!VT.isVector() || !VT.isSimple())
32248 EVT InVT = In.getValueType();
32249 unsigned NumElems = VT.getVectorNumElements();
32251 EVT ScalarVT = VT.getVectorElementType();
32252 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32253 isPowerOf2_32(NumElems)))
32256 // InScalarVT is the intermediate type in AVG pattern and it should be greater
32257 // than the original input type (i8/i16).
32258 EVT InScalarVT = InVT.getVectorElementType();
32259 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32262 if (!Subtarget.hasSSE2())
32264 if (Subtarget.hasBWI()) {
32265 if (VT.getSizeInBits() > 512)
32267 } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256)
      return SDValue();
  } else {
    if (VT.getSizeInBits() > 128)
      return SDValue();
  }
32275 // Detect the following pattern:
32277 // %1 = zext <N x i8> %a to <N x i32>
32278 // %2 = zext <N x i8> %b to <N x i32>
32279 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32280 // %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
32282 // %6 = trunc <N x i32> %5 to <N x i8>
32284 // In AVX512, the last instruction can also be a trunc store.
32286 if (In.getOpcode() != ISD::SRL)
32289 // A lambda checking the given SDValue is a constant vector and each element
32290 // is in the range [Min, Max].
32291 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32292 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32293 if (!BV || !BV->isConstant())
32295 for (SDValue Op : V->ops()) {
32296 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32299 uint64_t Val = C->getZExtValue();
32300 if (Val < Min || Val > Max)
32306 // Check if each element of the vector is right-shifted by one.
32307 auto LHS = In.getOperand(0);
32308 auto RHS = In.getOperand(1);
32309 if (!IsConstVectorInRange(RHS, 1, 1))
32311 if (LHS.getOpcode() != ISD::ADD)
32314 // Detect a pattern of a + b + 1 where the order doesn't matter.
32315 SDValue Operands[3];
32316 Operands[0] = LHS.getOperand(0);
32317 Operands[1] = LHS.getOperand(1);
32319 // Take care of the case when one of the operands is a constant vector whose
32320 // element is in the range [1, 256].
32321 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32322 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32323 Operands[0].getOperand(0).getValueType() == VT) {
32324 // The pattern is detected. Subtract one from the constant vector, then
32325 // demote it and emit an X86ISD::AVG instruction.
32326 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32327 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32328 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32329 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32333 if (Operands[0].getOpcode() == ISD::ADD)
32334 std::swap(Operands[0], Operands[1]);
32335 else if (Operands[1].getOpcode() != ISD::ADD)
32337 Operands[2] = Operands[1].getOperand(0);
32338 Operands[1] = Operands[1].getOperand(1);
32340 // Now we have three operands of two additions. Check that one of them is a
32341 // constant vector with ones, and the other two are promoted from i8/i16.
32342 for (int i = 0; i < 3; ++i) {
32343 if (!IsConstVectorInRange(Operands[i], 1, 1))
32345 std::swap(Operands[i], Operands[2]);
32347 // Check if Operands[0] and Operands[1] are results of type promotion.
32348 for (int j = 0; j < 2; ++j)
32349 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32350 Operands[j].getOperand(0).getValueType() != VT)
32353 // The pattern is detected, emit an X86ISD::AVG instruction.
32354 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32355 Operands[1].getOperand(0));
32361 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32362 TargetLowering::DAGCombinerInfo &DCI,
32363 const X86Subtarget &Subtarget) {
32364 LoadSDNode *Ld = cast<LoadSDNode>(N);
32365 EVT RegVT = Ld->getValueType(0);
32366 EVT MemVT = Ld->getMemoryVT();
32368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32370 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32371 // into two 16-byte operations.
32372 ISD::LoadExtType Ext = Ld->getExtensionType();
32374 unsigned AddressSpace = Ld->getAddressSpace();
32375 unsigned Alignment = Ld->getAlignment();
32376 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32377 Ext == ISD::NON_EXTLOAD &&
32378 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32379 AddressSpace, Alignment, &Fast) && !Fast) {
32380 unsigned NumElems = RegVT.getVectorNumElements();
32384 SDValue Ptr = Ld->getBasePtr();
32386 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), NumElems / 2);
32389 SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32390 Alignment, Ld->getMemOperand()->getFlags());
32392 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32394 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32395 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32396 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32398 Load1.getValue(1), Load2.getValue(1));
32400 SDValue NewVec = DAG.getUNDEF(RegVT);
32401 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32402 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32403 return DCI.CombineTo(N, NewVec, TF, true);
32409 /// If V is a build vector of boolean constants and exactly one of those
32410 /// constants is true, return the operand index of that true element.
32411 /// Otherwise, return -1.
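/// For example (illustrative): a mask of <i1 0, i1 0, i1 1, i1 0> returns 2,
/// while <i1 0, i1 0, i1 0, i1 0> and <i1 1, i1 1, i1 0, i1 0> both return -1.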
32412 static int getOneTrueElt(SDValue V) {
32413 // This needs to be a build vector of booleans.
32414 // TODO: Checking for the i1 type matches the IR definition for the mask,
32415 // but the mask check could be loosened to i8 or other types. That might
32416 // also require checking more than 'allOnesValue'; eg, the x86 HW
32417 // instructions only require that the MSB is set for each mask element.
32418 // The ISD::MSTORE comments/definition do not specify how the mask operand is formatted.
32420 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32421 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32424 int TrueIndex = -1;
32425 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32426 for (unsigned i = 0; i < NumElts; ++i) {
32427 const SDValue &Op = BV->getOperand(i);
32430 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32433 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32434 // If we already found a one, this is too many.
32435 if (TrueIndex >= 0)
32443 /// Given a masked memory load/store operation, return true if it has one mask
32444 /// bit set. If it has one mask bit set, then also return the memory address of
32445 /// the scalar element to load/store, the vector index to insert/extract that
32446 /// scalar element, and the alignment for the scalar memory access.
32447 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32448 SelectionDAG &DAG, SDValue &Addr,
32449 SDValue &Index, unsigned &Alignment) {
32450 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32451 if (TrueMaskElt < 0)
32454 // Get the address of the one scalar element that is specified by the mask
32455 // using the appropriate offset from the base pointer.
32456 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32457 Addr = MaskedOp->getBasePtr();
32458 if (TrueMaskElt != 0) {
32459 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32460 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32463 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32464 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
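// An illustrative case (assumed values, not from the source): for a v4f32
// masked op whose only set mask bit is element 3, EltVT is f32 with a store
// size of 4, so Addr becomes BasePtr + 12, Index is 3, and Alignment is
// MinAlign(original alignment, 4).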
32468 /// If exactly one element of the mask is set for a non-extending masked load,
32469 /// it can be reduced to a scalar load and a vector insert.
32470 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32471 /// mask have already been optimized in IR, so we don't bother with those here.
32473 static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32474 TargetLowering::DAGCombinerInfo &DCI) {
32475 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32476 // However, some target hooks may need to be added to know when the transform
32477 // is profitable. Endianness would also have to be considered.
32479 SDValue Addr, VecIndex;
32480 unsigned Alignment;
32481 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32484 // Load the one scalar element that is specified by the mask using the
32485 // appropriate offset from the base pointer.
32487 EVT VT = ML->getValueType(0);
32488 EVT EltVT = VT.getVectorElementType();
32490 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32491 Alignment, ML->getMemOperand()->getFlags());
32493 // Insert the loaded element into the appropriate place in the vector.
32494 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32496 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
32500 static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32501 TargetLowering::DAGCombinerInfo &DCI) {
32502 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32506 EVT VT = ML->getValueType(0);
32508 // If we are loading the first and last elements of a vector, it is safe and
32509 // always faster to load the whole vector. Replace the masked load with a
32510 // vector load and select.
32511 unsigned NumElts = VT.getVectorNumElements();
32512 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32513 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32514 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32515 if (LoadFirstElt && LoadLastElt) {
32516 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32517 ML->getMemOperand());
32518 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32519 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32522 // Convert a masked load with a constant mask into a masked load and a select.
32523 // This allows the select operation to use a faster kind of select instruction
32524 // (for example, vblendvps -> vblendps).
32526 // Don't try this if the pass-through operand is already undefined. That would
32527 // cause an infinite loop because that's what we're about to create.
32528 if (ML->getSrc0().isUndef())
32531 // The new masked load has an undef pass-through operand. The select uses the
32532 // original pass-through operand.
32533 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32534 ML->getMask(), DAG.getUNDEF(VT),
32535 ML->getMemoryVT(), ML->getMemOperand(),
32536 ML->getExtensionType());
32537 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32539 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
32542 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32543 TargetLowering::DAGCombinerInfo &DCI,
32544 const X86Subtarget &Subtarget) {
32545 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32547 // TODO: Expanding load with constant mask may be optimized as well.
32548 if (Mld->isExpandingLoad())
32551 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32552 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32554 // TODO: Do some AVX512 subsets benefit from this transform?
32555 if (!Subtarget.hasAVX512())
32556 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
32560 if (Mld->getExtensionType() != ISD::SEXTLOAD)
32563 // Resolve extending loads.
32564 EVT VT = Mld->getValueType(0);
32565 unsigned NumElems = VT.getVectorNumElements();
32566 EVT LdVT = Mld->getMemoryVT();
32569 assert(LdVT != VT && "Cannot extend to the same type");
32570 unsigned ToSz = VT.getScalarSizeInBits();
32571 unsigned FromSz = LdVT.getScalarSizeInBits();
32572 // From/To sizes and ElemCount must be powers of two.
32573 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32574 "Unexpected size for extending masked load");
32576 unsigned SizeRatio = ToSz / FromSz;
32577 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
32579 // Create a type on which we perform the shuffle.
32580 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32581 LdVT.getScalarType(), NumElems*SizeRatio);
32582 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
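// Illustrative numbers (assumed, not from the source): for a masked sextload
// from v8i8 memory to a v8i16 result, FromSz = 8, ToSz = 16, SizeRatio = 2,
// and WideVecVT = v16i8, i.e. the same 128 bits as v8i16 but with the memory
// element type, so the pass-through value and the mask can be rewritten in
// terms of the narrow elements.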
32584 // Convert Src0 value.
32585 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
32586 if (!Mld->getSrc0().isUndef()) {
32587 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32588 for (unsigned i = 0; i != NumElems; ++i)
32589 ShuffleVec[i] = i * SizeRatio;
32591 // Can't shuffle using an illegal type.
32592 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32593 "WideVecVT should be legal");
32594 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
32595 DAG.getUNDEF(WideVecVT), ShuffleVec);
32597 // Prepare the new mask.
32598 SDValue NewMask;
32599 SDValue Mask = Mld->getMask();
32600 if (Mask.getValueType() == VT) {
32601 // Mask and original value have the same type.
32602 NewMask = DAG.getBitcast(WideVecVT, Mask);
32603 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32604 for (unsigned i = 0; i != NumElems; ++i)
32605 ShuffleVec[i] = i * SizeRatio;
32606 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
32607 ShuffleVec[i] = NumElems * SizeRatio;
32608 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32609 DAG.getConstant(0, dl, WideVecVT),
32610 ShuffleVec);
32611 } else {
32612 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32613 unsigned WidenNumElts = NumElems*SizeRatio;
32614 unsigned MaskNumElts = VT.getVectorNumElements();
32615 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32616 WidenNumElts);
32618 unsigned NumConcat = WidenNumElts / MaskNumElts;
32619 SmallVector<SDValue, 16> Ops(NumConcat);
32620 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32621 Ops[0] = Mask;
32622 for (unsigned i = 1; i != NumConcat; ++i)
32623 Ops[i] = ZeroVal;
32625 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32626 }
32628 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
32629 Mld->getBasePtr(), NewMask, WideSrc0,
32630 Mld->getMemoryVT(), Mld->getMemOperand(),
32632 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
32633 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
32636 /// If exactly one element of the mask is set for a non-truncating masked store,
32637 /// it can be reduced to a vector extract and a scalar store.
32638 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32639 /// mask have already been optimized in IR, so we don't bother with those here.
32640 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
32641 SelectionDAG &DAG) {
32642 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32643 // However, some target hooks may need to be added to know when the transform
32644 // is profitable. Endianness would also have to be considered.
32646 SDValue Addr, VecIndex;
32647 unsigned Alignment;
32648 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
32651 // Extract the one scalar element that is actually being stored.
32653 EVT VT = MS->getValue().getValueType();
32654 EVT EltVT = VT.getVectorElementType();
32655 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
32656 MS->getValue(), VecIndex);
32658 // Store that element at the appropriate offset from the base pointer.
32659 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
32660 Alignment, MS->getMemOperand()->getFlags());
32663 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
32664 const X86Subtarget &Subtarget) {
32665 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
32667 if (Mst->isCompressingStore())
32670 if (!Mst->isTruncatingStore())
32671 return reduceMaskedStoreToScalarStore(Mst, DAG);
32673 // Resolve truncating stores.
32674 EVT VT = Mst->getValue().getValueType();
32675 unsigned NumElems = VT.getVectorNumElements();
32676 EVT StVT = Mst->getMemoryVT();
32679 assert(StVT != VT && "Cannot truncate to the same type");
32680 unsigned FromSz = VT.getScalarSizeInBits();
32681 unsigned ToSz = StVT.getScalarSizeInBits();
32683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32685 // The truncating store is legal in some cases. For example
32686 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
32687 // are provided as truncating stores.
32688 // In this case we don't need any further transformations.
32689 if (TLI.isTruncStoreLegal(VT, StVT))
32692 // From/To sizes and ElemCount must be powers of two.
32693 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32694 "Unexpected size for truncating masked store");
32695 // We are going to use the original vector elt for storing.
32696 // Accumulated smaller vector elements must be a multiple of the store size.
32697 assert (((NumElems * FromSz) % ToSz) == 0 &&
32698 "Unexpected ratio for truncating masked store");
32700 unsigned SizeRatio = FromSz / ToSz;
32701 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32703 // Create a type on which we perform the shuffle.
32704 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32705 StVT.getScalarType(), NumElems*SizeRatio);
32707 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
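// Illustrative numbers (assumed, not from the source): when truncating a
// v8i32 value to v8i16 memory, FromSz = 32, ToSz = 16, SizeRatio = 2 and
// WideVecVT = v16i16, so the shuffle below packs the eight truncated i16
// values into the low elements of the wide register.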
32709 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
32710 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
32711 for (unsigned i = 0; i != NumElems; ++i)
32712 ShuffleVec[i] = i * SizeRatio;
32714 // Can't shuffle using an illegal type.
32715 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
32716 "WideVecVT should be legal");
32718 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32719 DAG.getUNDEF(WideVecVT),
32720 ShuffleVec);
32722 SDValue NewMask;
32723 SDValue Mask = Mst->getMask();
32724 if (Mask.getValueType() == VT) {
32725 // Mask and original value have the same type.
32726 NewMask = DAG.getBitcast(WideVecVT, Mask);
32727 for (unsigned i = 0; i != NumElems; ++i)
32728 ShuffleVec[i] = i * SizeRatio;
32729 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
32730 ShuffleVec[i] = NumElems*SizeRatio;
32731 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
32732 DAG.getConstant(0, dl, WideVecVT),
32733 ShuffleVec);
32734 } else {
32735 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
32736 unsigned WidenNumElts = NumElems*SizeRatio;
32737 unsigned MaskNumElts = VT.getVectorNumElements();
32738 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
32739 WidenNumElts);
32741 unsigned NumConcat = WidenNumElts / MaskNumElts;
32742 SmallVector<SDValue, 16> Ops(NumConcat);
32743 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
32744 Ops[0] = Mask;
32745 for (unsigned i = 1; i != NumConcat; ++i)
32746 Ops[i] = ZeroVal;
32748 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
32749 }
32751 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
32752 Mst->getBasePtr(), NewMask, StVT,
32753 Mst->getMemOperand(), false);
32756 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
32757 const X86Subtarget &Subtarget) {
32758 StoreSDNode *St = cast<StoreSDNode>(N);
32759 EVT VT = St->getValue().getValueType();
32760 EVT StVT = St->getMemoryVT();
32762 SDValue StoredVal = St->getOperand(1);
32763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32765 // If we are saving a concatenation of two XMM registers and 32-byte stores
32766 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
32768 unsigned AddressSpace = St->getAddressSpace();
32769 unsigned Alignment = St->getAlignment();
32770 if (VT.is256BitVector() && StVT == VT &&
32771 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
32772 AddressSpace, Alignment, &Fast) &&
32774 unsigned NumElems = VT.getVectorNumElements();
32778 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
32779 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
32781 SDValue Ptr0 = St->getBasePtr();
32782 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
32785 SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
32786 Alignment, St->getMemOperand()->getFlags());
32788 SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
32789 std::min(16U, Alignment), St->getMemOperand()->getFlags());
32790 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
32793 // Optimize trunc store (of multiple scalars) to shuffle and store.
32794 // First, pack all of the elements in one place. Next, store to memory
32795 // in fewer chunks.
32796 if (St->isTruncatingStore() && VT.isVector()) {
32797 // Check if we can detect an AVG pattern from the truncation. If yes,
32798 // replace the trunc store by a normal store with the result of the X86ISD::AVG instruction.
32800 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, Subtarget, dl))
32802 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
32803 St->getPointerInfo(), St->getAlignment(),
32804 St->getMemOperand()->getFlags());
32807 if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
32808 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
32809 dl, Val, St->getBasePtr(),
32810 St->getMemoryVT(), St->getMemOperand(), DAG);
32812 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32813 unsigned NumElems = VT.getVectorNumElements();
32814 assert(StVT != VT && "Cannot truncate to the same type");
32815 unsigned FromSz = VT.getScalarSizeInBits();
32816 unsigned ToSz = StVT.getScalarSizeInBits();
32818 // The truncating store is legal in some cases. For example
32819 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
32820 // are provided as truncating stores.
32821 // In this case we don't need any further transformations.
32822 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
32825 // From/To sizes and ElemCount must be powers of two.
32826 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
32827 // We are going to use the original vector elt for storing.
32828 // Accumulated smaller vector elements must be a multiple of the store size.
32829 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
32831 unsigned SizeRatio = FromSz / ToSz;
32833 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
32835 // Create a type on which we perform the shuffle
32836 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32837 StVT.getScalarType(), NumElems*SizeRatio);
32839 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32841 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
32842 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
32843 for (unsigned i = 0; i != NumElems; ++i)
32844 ShuffleVec[i] = i * SizeRatio;
32846 // Can't shuffle using an illegal type.
32847 if (!TLI.isTypeLegal(WideVecVT))
32850 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
32851 DAG.getUNDEF(WideVecVT),
32853 // At this point all of the data is stored at the bottom of the
32854 // register. We now need to save it to memory.
32856 // Find the largest store unit
32857 MVT StoreType = MVT::i8;
32858 for (MVT Tp : MVT::integer_valuetypes()) {
32859 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
32860 StoreType = Tp;
32861 }
32863 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
32864 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
32865 (64 <= NumElems * ToSz))
32866 StoreType = MVT::f64;
32868 // Bitcast the original vector into a vector of store-size units
32869 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
32870 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
32871 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
32872 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
32873 SmallVector<SDValue, 8> Chains;
32874 SDValue Ptr = St->getBasePtr();
32876 // Perform one or more big stores into memory.
32877 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
32878 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
32879 StoreType, ShuffWide,
32880 DAG.getIntPtrConstant(i, dl));
32882 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
32883 St->getAlignment(), St->getMemOperand()->getFlags());
32884 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
32885 Chains.push_back(Ch);
32888 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
32891 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
32892 // the FP state in cases where an emms may be missing.
32893 // A preferable solution to the general problem is to figure out the right
32894 // places to insert EMMS. This qualifies as a quick hack.
32896 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
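// A minimal sketch of the i64 case (assumed IR, not from the source): on a
// 32-bit target with SSE2,
//   %v = load i64, i64* %p
//   store i64 %v, i64* %q
// becomes an f64 load/store pair, moving the 8 bytes in one XMM access
// instead of two 4-byte GPR accesses.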
32897 if (VT.getSizeInBits() != 64)
32900 const Function *F = DAG.getMachineFunction().getFunction();
32901 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
32903 bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
32904 if ((VT.isVector() ||
32905 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
32906 isa<LoadSDNode>(St->getValue()) &&
32907 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
32908 St->getChain().hasOneUse() && !St->isVolatile()) {
32909 SDNode* LdVal = St->getValue().getNode();
32910 LoadSDNode *Ld = nullptr;
32911 int TokenFactorIndex = -1;
32912 SmallVector<SDValue, 8> Ops;
32913 SDNode* ChainVal = St->getChain().getNode();
32914 // Must be a store of a load. We currently handle two cases: the load
32915 // is a direct child, or it's under an intervening TokenFactor. It is
32916 // possible to dig deeper under nested TokenFactors.
32917 if (ChainVal == LdVal)
32918 Ld = cast<LoadSDNode>(St->getChain());
32919 else if (St->getValue().hasOneUse() &&
32920 ChainVal->getOpcode() == ISD::TokenFactor) {
32921 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
32922 if (ChainVal->getOperand(i).getNode() == LdVal) {
32923 TokenFactorIndex = i;
32924 Ld = cast<LoadSDNode>(St->getValue());
32926 Ops.push_back(ChainVal->getOperand(i));
32930 if (!Ld || !ISD::isNormalLoad(Ld))
32933 // If this is not the MMX case, i.e. we are just turning i64 load/store
32934 // into f64 load/store, avoid the transformation if there are multiple
32935 // uses of the loaded value.
32936 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
32941 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
32942 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store pair instead.
32944 if (Subtarget.is64Bit() || F64IsLegal) {
32945 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
32946 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
32947 Ld->getPointerInfo(), Ld->getAlignment(),
32948 Ld->getMemOperand()->getFlags());
32949 SDValue NewChain = NewLd.getValue(1);
32950 if (TokenFactorIndex >= 0) {
32951 Ops.push_back(NewChain);
32952 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32954 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
32955 St->getPointerInfo(), St->getAlignment(),
32956 St->getMemOperand()->getFlags());
32959 // Otherwise, lower to two pairs of 32-bit loads / stores.
32960 SDValue LoAddr = Ld->getBasePtr();
32961 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
32963 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
32964 Ld->getPointerInfo(), Ld->getAlignment(),
32965 Ld->getMemOperand()->getFlags());
32966 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
32967 Ld->getPointerInfo().getWithOffset(4),
32968 MinAlign(Ld->getAlignment(), 4),
32969 Ld->getMemOperand()->getFlags());
32971 SDValue NewChain = LoLd.getValue(1);
32972 if (TokenFactorIndex >= 0) {
32973 Ops.push_back(LoLd);
32974 Ops.push_back(HiLd);
32975 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32978 LoAddr = St->getBasePtr();
32979 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
32982 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
32983 St->getAlignment(), St->getMemOperand()->getFlags());
32984 SDValue HiSt = DAG.getStore(
32985 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
32986 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
32987 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
32990 // This is similar to the above case, but here we handle a scalar 64-bit
32991 // integer store that is extracted from a vector on a 32-bit target.
32992 // If we have SSE2, then we can treat it like a floating-point double
32993 // to get past legalization. The execution dependencies fixup pass will
32994 // choose the optimal machine instruction for the store if this really is
32995 // an integer or v2f32 rather than an f64.
32996 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
32997 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
32998 SDValue OldExtract = St->getOperand(1);
32999 SDValue ExtOp0 = OldExtract.getOperand(0);
33000 unsigned VecSize = ExtOp0.getValueSizeInBits();
33001 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33002 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33003 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33004 BitCast, OldExtract.getOperand(1));
33005 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33006 St->getPointerInfo(), St->getAlignment(),
33007 St->getMemOperand()->getFlags());
33013 /// Return 'true' if this vector operation is "horizontal"
33014 /// and return the operands for the horizontal operation in LHS and RHS. A
33015 /// horizontal operation performs the binary operation on successive elements
33016 /// of its first operand, then on successive elements of its second operand,
33017 /// returning the resulting values in a vector. For example, if
33018 /// A = < float a0, float a1, float a2, float a3 >
33020 /// B = < float b0, float b1, float b2, float b3 >
33021 /// then the result of doing a horizontal operation on A and B is
33022 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33023 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33024 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33025 /// set to A, RHS to B, and the routine returns 'true'.
33026 /// Note that the binary operation should have the property that if one of the
33027 /// operands is UNDEF then the result is UNDEF.
33028 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33029 // Look for the following pattern: if
33030 // A = < float a0, float a1, float a2, float a3 >
33031 // B = < float b0, float b1, float b2, float b3 >
33033 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33034 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33035 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33036 // which is A horizontal-op B.
33038 // At least one of the operands should be a vector shuffle.
33039 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33040 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33043 MVT VT = LHS.getSimpleValueType();
33045 assert((VT.is128BitVector() || VT.is256BitVector()) &&
33046 "Unsupported vector type for horizontal add/sub");
33048 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33049 // operate independently on 128-bit lanes.
33050 unsigned NumElts = VT.getVectorNumElements();
33051 unsigned NumLanes = VT.getSizeInBits()/128;
33052 unsigned NumLaneElts = NumElts / NumLanes;
33053 assert((NumLaneElts % 2 == 0) &&
33054 "Vector type should have an even number of elements in each lane");
33055 unsigned HalfLaneElts = NumLaneElts/2;
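// Illustrative lane math (assuming v8f32, not from the source): NumElts = 8,
// NumLanes = 2, NumLaneElts = 4 and HalfLaneElts = 2, so within each 128-bit
// lane the first two results come from LHS/A and the last two from RHS/B.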
33057 // View LHS in the form
33058 // LHS = VECTOR_SHUFFLE A, B, LMask
33059 // If LHS is not a shuffle then pretend it is the shuffle
33060 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33061 // NOTE: in what follows a default initialized SDValue represents an UNDEF of type VT.
33064 SmallVector<int, 16> LMask(NumElts);
33065 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33066 if (!LHS.getOperand(0).isUndef())
33067 A = LHS.getOperand(0);
33068 if (!LHS.getOperand(1).isUndef())
33069 B = LHS.getOperand(1);
33070 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33071 std::copy(Mask.begin(), Mask.end(), LMask.begin());
33073 if (!LHS.isUndef())
33075 for (unsigned i = 0; i != NumElts; ++i)
33079 // Likewise, view RHS in the form
33080 // RHS = VECTOR_SHUFFLE C, D, RMask
33082 SmallVector<int, 16> RMask(NumElts);
33083 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33084 if (!RHS.getOperand(0).isUndef())
33085 C = RHS.getOperand(0);
33086 if (!RHS.getOperand(1).isUndef())
33087 D = RHS.getOperand(1);
33088 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33089 std::copy(Mask.begin(), Mask.end(), RMask.begin());
33091 if (!RHS.isUndef())
33093 for (unsigned i = 0; i != NumElts; ++i)
33097 // Check that the shuffles are both shuffling the same vectors.
33098 if (!(A == C && B == D) && !(A == D && B == C))
33101 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33102 if (!A.getNode() && !B.getNode())
33105 // If A and B occur in reverse order in RHS, then "swap" them (which means
33106 // rewriting the mask).
33108 ShuffleVectorSDNode::commuteMask(RMask);
33110 // At this point LHS and RHS are equivalent to
33111 // LHS = VECTOR_SHUFFLE A, B, LMask
33112 // RHS = VECTOR_SHUFFLE A, B, RMask
33113 // Check that the masks correspond to performing a horizontal operation.
33114 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33115 for (unsigned i = 0; i != NumLaneElts; ++i) {
33116 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33118 // Ignore any UNDEF components.
33119 if (LIdx < 0 || RIdx < 0 ||
33120 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33121 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33124 // Check that successive elements are being operated on. If not, this is
33125 // not a horizontal operation.
33126 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33127 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
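// Worked example (assuming v4f32 and l = 0, not from the source): for i = 2,
// Src = 1 and Index = 2*0 + 4*1 + 0 = 4, so the masks must select elements 4
// and 5 of the concatenated pair (A, B), i.e. b0 and b1, matching the
// "b0 op b1" slot of the horizontal op.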
33128 if (!(LIdx == Index && RIdx == Index + 1) &&
33129 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33134 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33135 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33139 /// Do target-specific dag combines on floating-point adds/subs.
33140 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33141 const X86Subtarget &Subtarget) {
33142 EVT VT = N->getValueType(0);
33143 SDValue LHS = N->getOperand(0);
33144 SDValue RHS = N->getOperand(1);
33145 bool IsFadd = N->getOpcode() == ISD::FADD;
33146 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33148 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33149 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33150 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33151 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33152 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33153 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33158 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
33160 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33161 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33162 const X86Subtarget &Subtarget,
33164 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33165 SDValue Src = N->getOperand(0);
33166 unsigned Opcode = Src.getOpcode();
33167 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33169 EVT VT = N->getValueType(0);
33170 EVT SrcVT = Src.getValueType();
33172 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33173 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33175 // Repeated operand, so we are only trading one output truncation for
33176 // one input truncation.
33180 // See if either operand has been extended from a smaller/equal size to
33181 // the truncation size, allowing a truncation to combine with the extend.
33182 unsigned Opcode0 = Op0.getOpcode();
33183 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33184 Opcode0 == ISD::ZERO_EXTEND) &&
33185 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33188 unsigned Opcode1 = Op1.getOpcode();
33189 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33190 Opcode1 == ISD::ZERO_EXTEND) &&
33191 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33194 // See if either operand is a single use constant which can be constant folded.
33196 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33197 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33198 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33199 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33202 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33203 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33204 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33205 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33208 // Don't combine if the operation has other uses.
33209 if (!N->isOnlyUserOf(Src.getNode()))
33212 // Only support vector truncation for now.
33213 // TODO: i64 scalar math would benefit as well.
33214 if (!VT.isVector())
33217 // In most cases it's only worth pre-truncating if we're only facing the cost
33218 // of one truncation,
33219 // i.e. if one of the inputs will constant fold or the input is repeated.
33224 SDValue Op0 = Src.getOperand(0);
33225 SDValue Op1 = Src.getOperand(1);
33226 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33227 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33228 return TruncateArithmetic(Op0, Op1);
33233 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
33234 // better to truncate if we have the chance.
33235 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33236 !TLI.isOperationLegal(Opcode, SrcVT))
33237 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
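// For instance (illustrative): trunc (mul <4 x i64> %x, %y) to <4 x i32>
// becomes mul <4 x i32> (trunc %x), (trunc %y), replacing an expensive
// 64-bit element multiply with a cheap 32-bit one.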
33240 SDValue Op0 = Src.getOperand(0);
33241 SDValue Op1 = Src.getOperand(1);
33242 if (TLI.isOperationLegal(Opcode, VT) &&
33243 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33244 return TruncateArithmetic(Op0, Op1);
33252 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33254 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33255 SmallVector<SDValue, 8> &Regs) {
33256 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33257 Regs[0].getValueType() == MVT::v2i64));
33258 EVT OutVT = N->getValueType(0);
33259 EVT OutSVT = OutVT.getVectorElementType();
33260 EVT InVT = Regs[0].getValueType();
33261 EVT InSVT = InVT.getVectorElementType();
33264 // First, use mask to unset all bits that won't appear in the result.
33265 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33266 "OutSVT can only be either i8 or i16.");
33268 APInt Mask = APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33269 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33270 for (auto &Reg : Regs)
33271 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33273 MVT UnpackedVT, PackedVT;
33274 if (OutSVT == MVT::i8) {
33275 UnpackedVT = MVT::v8i16;
33276 PackedVT = MVT::v16i8;
33278 UnpackedVT = MVT::v4i32;
33279 PackedVT = MVT::v8i16;
33282 // In each iteration, halve the element size of the type.
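// Illustrative flow (assuming four v4i32 inputs truncated to v16i8, not from
// the source): the loop below runs twice, first packing 4 registers down to
// 2, then 2 down to 1, with each X86ISD::PACKUS combining two source
// registers into one narrower register.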
33283 auto RegNum = Regs.size();
33284 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33285 j < e; j *= 2, RegNum /= 2) {
33286 for (unsigned i = 0; i < RegNum; i++)
33287 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33288 for (unsigned i = 0; i < RegNum / 2; i++)
33289 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33293 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
33294 // and then extract a subvector as the result since v8i8 is not a legal type.
33295 if (OutVT == MVT::v8i8) {
33296 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33297 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33298 DAG.getIntPtrConstant(0, DL));
33300 } else if (RegNum > 1) {
33301 Regs.resize(RegNum);
33302 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33307 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33309 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33311 SmallVector<SDValue, 8> &Regs) {
33312 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33313 EVT OutVT = N->getValueType(0);
33316 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
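// For example (illustrative values): a v4i32 lane holding 0x00001234 is
// unchanged by the shl/sra pair, while 0x0000FFFF becomes 0xFFFFFFFF, i.e.
// the low 16 bits are sign-extended to 32 bits so that PACKSS saturates to
// the intended i16 value.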
33317 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33318 for (auto &Reg : Regs) {
33319 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33321 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33325 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33326 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33329 if (Regs.size() > 2) {
33330 Regs.resize(Regs.size() / 2);
33331 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33336 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33337 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33338 /// legalization the truncation will be translated into a BUILD_VECTOR whose
33339 /// elements are extracted from a vector and then truncated, and it is
33340 /// difficult to perform this optimization on that form.
33341 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33342 const X86Subtarget &Subtarget) {
33343 EVT OutVT = N->getValueType(0);
33344 if (!OutVT.isVector())
33347 SDValue In = N->getOperand(0);
33348 if (!In.getValueType().isSimple())
33351 EVT InVT = In.getValueType();
33352 unsigned NumElems = OutVT.getVectorNumElements();
33354 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33355 // SSE2, and we need to take care of it specially.
33356 // AVX512 provides vpmovdb.
33357 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33360 EVT OutSVT = OutVT.getVectorElementType();
33361 EVT InSVT = InVT.getVectorElementType();
33362 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33363 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33367 // SSSE3's pshufb results in fewer instructions in the cases below.
33368 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33369 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33370 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33375 // Split a long vector into vectors of legal type.
33376 unsigned RegNum = InVT.getSizeInBits() / 128;
33377 SmallVector<SDValue, 8> SubVec(RegNum);
33378 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33379 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33381 for (unsigned i = 0; i < RegNum; i++)
33382 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33383 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33385 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33386 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33387 // truncate 2 x v4i32 to v8i16.
33388 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33389 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33390 else if (InSVT == MVT::i32)
33391 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33396 /// This function transforms vector truncation of 'all or none' bits values
33397 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
33398 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33400 const X86Subtarget &Subtarget) {
33401 // Requires SSE2 but AVX512 has fast truncate.
33402 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33405 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33408 SDValue In = N->getOperand(0);
33409 if (!In.getValueType().isSimple())
33412 MVT VT = N->getValueType(0).getSimpleVT();
33413 MVT SVT = VT.getScalarType();
33415 MVT InVT = In.getValueType().getSimpleVT();
33416 MVT InSVT = InVT.getScalarType();
33418 // Use PACKSS if the input is a splatted sign bit.
33419 // e.g. Comparison result, sext_in_reg, etc.
33420 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33421 if (NumSignBits != InSVT.getSizeInBits())
33424 // Check we have a truncation suited for PACKSS.
33425 if (!VT.is128BitVector() && !VT.is256BitVector())
33427 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33429 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33432 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33435 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33436 const X86Subtarget &Subtarget) {
33437 EVT VT = N->getValueType(0);
33438 SDValue Src = N->getOperand(0);
33441 // Attempt to pre-truncate inputs to arithmetic ops instead.
33442 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33445 // Try to detect AVG pattern first.
33446 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33449 // Try to combine truncation with unsigned saturation.
33450 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33453 // The bitcast source is a direct mmx result.
33454 // Detect a bitcast from x86mmx to i32.
33455 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33456 SDValue BCSrc = Src.getOperand(0);
33457 if (BCSrc.getValueType() == MVT::x86mmx)
33458 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33461 // Try to truncate extended sign bits with PACKSS.
33462 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33465 return combineVectorTruncation(N, DAG, Subtarget);
33468 /// Returns the negated value if the node \p N flips the sign of an FP value.
33470 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33471 /// AVX512F does not have FXOR, so FNEG is lowered as
33472 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33473 /// In this case we go through all bitcasts.
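/// For example (illustrative, f32 elements): the sign mask is 0x80000000, so
///   (bitcast (xor (bitcast v4f32 %x to v4i32), <i32 0x80000000, ...>)
///            to v4f32)
/// is recognized as a negation of %x.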
33474 static SDValue isFNEG(SDNode *N) {
33475 if (N->getOpcode() == ISD::FNEG)
33476 return N->getOperand(0);
33478 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33479 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33482 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33483 if (!Op1.getValueType().isFloatingPoint())
33486 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33488 unsigned EltBits = Op1.getScalarValueSizeInBits();
33489 auto isSignMask = [&](const ConstantFP *C) {
33490 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33493 // There is more than one way to represent the same constant on
33494 // different X86 targets. The type of the node may also depend on its size.
33495 // - load scalar value and broadcast
33496 // - BUILD_VECTOR node
33497 // - load from a constant pool.
33498 // We check all variants here.
33499 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33500 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33501 if (isSignMask(cast<ConstantFP>(C)))
33504 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33505 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33506 if (isSignMask(CN->getConstantFPValue()))
33509 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33510 if (C->getType()->isVectorTy()) {
33511 if (auto *SplatV = C->getSplatValue())
33512 if (isSignMask(cast<ConstantFP>(SplatV)))
33514 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33515 if (isSignMask(FPConst))
33521 /// Do target-specific dag combines on floating point negations.
33522 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33523 const X86Subtarget &Subtarget) {
33524 EVT OrigVT = N->getValueType(0);
33525 SDValue Arg = isFNEG(N);
33526 assert(Arg.getNode() && "N is expected to be an FNEG node");
33528 EVT VT = Arg.getValueType();
33529 EVT SVT = VT.getScalarType();
33532 // Let legalize expand this if it isn't a legal type yet.
33533 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33536 // If we're negating an FMUL node on a target with FMA, then we can avoid the
33537 // use of a constant by performing (-0 - A*B) instead.
33538 // FIXME: Check rounding control flags as well once they become available.
33539 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33540 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33541 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33542 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33543 Arg.getOperand(1), Zero);
33544 return DAG.getBitcast(OrigVT, NewNode);
33547 // If we're negating an FMA node, then we can adjust the
33548 // instruction to include the extra negation.
33549 unsigned NewOpcode = 0;
33550 if (Arg.hasOneUse()) {
33551 switch (Arg.getOpcode()) {
33552 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33553 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33554 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33555 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33556 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33557 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
33558 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
33559 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
33560 // We can't handle scalar intrinsic nodes here because they would only
33561 // invert one element and not the whole vector. But we could try to handle
33562 // a negation of the lower element only.
33566 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
33567 Arg.getNode()->ops()));
33572 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
33573 const X86Subtarget &Subtarget) {
33574 MVT VT = N->getSimpleValueType(0);
33575 // If we have integer vector types available, use the integer opcodes.
33576 if (VT.isVector() && Subtarget.hasSSE2()) {
33579 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
33581 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
33582 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
33583 unsigned IntOpcode;
33584 switch (N->getOpcode()) {
33585 default: llvm_unreachable("Unexpected FP logic op");
33586 case X86ISD::FOR: IntOpcode = ISD::OR; break;
33587 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
33588 case X86ISD::FAND: IntOpcode = ISD::AND; break;
33589 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
33591 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
33592 return DAG.getBitcast(VT, IntOp);
33597 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
33598 TargetLowering::DAGCombinerInfo &DCI,
33599 const X86Subtarget &Subtarget) {
33600 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
33603 if (DCI.isBeforeLegalizeOps())
33606 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
33609 if (Subtarget.hasCMov())
33610 if (SDValue RV = combineIntegerAbs(N, DAG))
33613 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33617 return combineFneg(N, DAG, Subtarget);
33622 static bool isNullFPScalarOrVectorConst(SDValue V) {
33623 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
33626 /// If a value is a scalar FP zero or a vector FP zero (potentially including
33627 /// undefined elements), return a zero constant that may be used to fold away
33628 /// that value. In the case of a vector, the returned constant will not contain
33629 /// undefined elements even if the input parameter does. This makes it suitable
33630 /// to be used as a replacement operand with operations (eg, bitwise-and) where
33631 /// an undef should not propagate.
33632 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
33633 const X86Subtarget &Subtarget) {
33634 if (!isNullFPScalarOrVectorConst(V))
33637 if (V.getValueType().isVector())
33638 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
33643 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
33644 const X86Subtarget &Subtarget) {
33645 SDValue N0 = N->getOperand(0);
33646 SDValue N1 = N->getOperand(1);
33647 EVT VT = N->getValueType(0);
33650 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
33651 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
33652 (VT == MVT::f64 && Subtarget.hasSSE2())))
33655 auto isAllOnesConstantFP = [](SDValue V) {
33656 auto *C = dyn_cast<ConstantFPSDNode>(V);
33657 return C && C->getConstantFPValue()->isAllOnesValue();
33660 // fand (fxor X, -1), Y --> fandn X, Y
33661 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
33662 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
33664 // fand X, (fxor Y, -1) --> fandn Y, X
33665 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
33666 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
33671 /// Do target-specific dag combines on X86ISD::FAND nodes.
33672 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
33673 const X86Subtarget &Subtarget) {
33674 // FAND(0.0, x) -> 0.0
33675 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
33678 // FAND(x, 0.0) -> 0.0
33679 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33682 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
33685 return lowerX86FPLogicOp(N, DAG, Subtarget);
33688 /// Do target-specific dag combines on X86ISD::FANDN nodes.
33689 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
33690 const X86Subtarget &Subtarget) {
33691 // FANDN(0.0, x) -> x
33692 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33693 return N->getOperand(1);
33695 // FANDN(x, 0.0) -> 0.0
33696 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
33699 return lowerX86FPLogicOp(N, DAG, Subtarget);
33702 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
33703 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
33704 const X86Subtarget &Subtarget) {
33705 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
33707 // F[X]OR(0.0, x) -> x
33708 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
33709 return N->getOperand(1);
33711 // F[X]OR(x, 0.0) -> x
33712 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
33713 return N->getOperand(0);
33716 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
33719 return lowerX86FPLogicOp(N, DAG, Subtarget);
33722 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
33723 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
33724 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
33726 // Only perform optimizations if UnsafeMath is used.
33727 if (!DAG.getTarget().Options.UnsafeFPMath)
33730 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
33731 // into FMINC and FMAXC, which are commutative operations.
33732 unsigned NewOp = 0;
33733 switch (N->getOpcode()) {
33734 default: llvm_unreachable("unknown opcode");
33735 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
33736 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
33739 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
33740 N->getOperand(0), N->getOperand(1));
33743 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
33744 const X86Subtarget &Subtarget) {
33745 if (Subtarget.useSoftFloat())
33748 // TODO: Check for global or instruction-level "nnan". In that case, we
33749 // should be able to lower to FMAX/FMIN alone.
33750 // TODO: If an operand is already known to be a NaN or not a NaN, this
33751 // should be an optional swap and FMAX/FMIN.
33753 EVT VT = N->getValueType(0);
33754 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
33755 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
33756 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
33759 // This takes at least 3 instructions, so favor a library call when operating
33760 // on a scalar and minimizing code size.
33761 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
33764 SDValue Op0 = N->getOperand(0);
33765 SDValue Op1 = N->getOperand(1);
33767 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
33768 DAG.getDataLayout(), *DAG.getContext(), VT);
33770 // There are 4 possibilities involving NaN inputs, and these are the required
33771 // outputs:
33772 //                   Op1
33773 //               Num     NaN
33774 //            ----------------
33775 //       Num  |  Max  |  Op0 |
33776 // Op0        ----------------
33777 //       NaN  |  Op1  |  NaN |
33778 //            ----------------
33780 // The SSE FP max/min instructions were not designed for this case, but rather
33781 // to implement:
33782 //   Min = Op1 < Op0 ? Op1 : Op0
33783 //   Max = Op1 > Op0 ? Op1 : Op0
33785 // So they always return Op0 if either input is a NaN. However, we can still
33786 // use those instructions for fmaxnum by selecting away a NaN input.
33788 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
33789 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
33790 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
33791 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
33793 // If Op0 is a NaN, select Op1. Otherwise, select the min or max. If both
33794 // operands are NaN, the NaN value of Op1 is the result.
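// For example (illustrative, fmaxnum): with Op0 = NaN and Op1 = 1.0,
// MinOrMax = FMAX(1.0, NaN) passes the NaN through, but IsOp0Nan is true, so
// the select yields 1.0, matching the IEEE-754 maxNum rule of ignoring a
// single NaN input.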
33795 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
33798 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
33799 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
33800 TargetLowering::DAGCombinerInfo &DCI,
33801 const X86Subtarget &Subtarget) {
33802 // ANDNP(0, x) -> x
33803 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
33804 return N->getOperand(1);
33806 // ANDNP(x, 0) -> 0
33807 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
33808 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
33810 EVT VT = N->getValueType(0);
33812 // Attempt to recursively combine a bitmask ANDNP with shuffles.
33813 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33815 SmallVector<int, 1> NonceMask; // Just a placeholder.
33816 NonceMask.push_back(0);
33817 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
33818 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
33820 return SDValue(); // This routine will use CombineTo to replace N.
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                         TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
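    // E.g. for a 32-bit index operand only the low Log2_32(32) == 5 bits are
    // demanded, because BT reduces the bit index modulo the operand width.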
    KnownBits Known;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
  SDLoc dl(N);

  // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
  // AVX2, since there is no sign-extended shift-right operation on a vector
  // with 64-bit elements.
  // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
  //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
    SDValue N00 = N0.getOperand(0);

    // EXTLOAD has a better solution on AVX2: it may be replaced with an
    // X86ISD::VSEXT node.
    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
      if (!ISD::isNormalLoad(N00.getNode()))
        return SDValue();

    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                N00, N1);
      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
    }
  }
  return SDValue();
}

/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
/// Promoting a sign/zero extension ahead of a no-overflow 'add' exposes
/// opportunities to combine math ops, use an LEA, or use a complex addressing
/// mode. This can eliminate extend, add, and shift instructions.
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
                                   const X86Subtarget &Subtarget) {
  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
      Ext->getOpcode() != ISD::ZERO_EXTEND)
    return SDValue();

  // TODO: This should be valid for other integer types.
  EVT VT = Ext->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  SDValue Add = Ext->getOperand(0);
  if (Add.getOpcode() != ISD::ADD)
    return SDValue();

  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
  bool NSW = Add->getFlags().hasNoSignedWrap();
  bool NUW = Add->getFlags().hasNoUnsignedWrap();

  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
  // into the 'zext' source.
  if ((Sext && !NSW) || (!Sext && !NUW))
    return SDValue();

  // Having a constant operand to the 'add' ensures that we are not increasing
  // the instruction count because the constant is extended for free below.
  // A constant operand can also become the displacement field of an LEA.
  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
  if (!AddOp1)
    return SDValue();

  // Don't make the 'add' bigger if there's no hope of combining it with some
  // other 'add' or 'shl' instruction.
  // TODO: It may be profitable to generate simpler LEA instructions in place
  // of single 'add' instructions, but the cost model for selecting an LEA
  // currently has a high threshold.
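  // E.g. once the extend is hoisted, (add (sext x), C) feeding another 'add'
  // or 'shl' can fold into a single LEA addressing mode
  // (base + index * scale + C), removing the separate extend and add.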
  bool HasLEAPotential = false;
  for (auto *User : Ext->uses()) {
    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
      HasLEAPotential = true;
      break;
    }
  }
  if (!HasLEAPotential)
    return SDValue();

  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
  SDValue AddOp0 = Add.getOperand(0);
  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

  // The wider add is guaranteed to not wrap because both operands are
  // sign- or zero-extended.
  SDNodeFlags Flags;
  Flags.setNoSignedWrap(NSW);
  Flags.setNoUnsignedWrap(NUW);
  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
}

/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
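  // An x86 8-bit divide leaves the quotient in AL and the remainder in AH;
  // the *_HREG nodes model reading AH already sign-/zero-extended to i32, so
  // the separate extend of the remainder (result 1) becomes free.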
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  return R.getValue(1);
}

/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
/// with UNDEFs) the input into vectors of the same size as the target type,
/// which then extend the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal then we will be
  // able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };
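  // E.g. ExtendVecSize(DL, v4i16, 128) concatenates the v4i16 input with one
  // undef v4i16 to form a v8i16 whose low four elements are the original
  // input.
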
  // If target-size is less than 128-bits, extend to a type that would extend
  // to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If target-size is 128-bits (or 256-bits on AVX2 targets), then convert to
  // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer to do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (!DCI.isBeforeLegalizeOps()) {
    if (InVT == MVT::i1) {
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
      return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
    }
    return SDValue();
  }

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending
    // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract
    // is efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
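    // Check: Bool = 0 gives sext(1) = -1 and zext(0) - 1 = -1; Bool = 1 gives
    // sext(0) = 0 and zext(1) - 1 = 0, so both forms agree.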
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // The multiplication is negated when exactly one of NegA, NegB is set
  // (NegA xor NegB).
  bool NegMul = (NegA != NegB);

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else {
    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
           "Unexpected opcode!");
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison, but ignore a
  // comparison with zero because that gets special treatment in EmitTest().
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue VecX = DAG.getBitcast(VecVT, X);
    SDValue VecY = DAG.getBitcast(VecVT, Y);

    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
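    // PCMPEQB sets each byte lane to 0xFF on a match and 0x00 otherwise, and
    // PMOVMSKB packs the sign bit of every byte into a GPR, so the all-match
    // mask is 0xFFFF for 16 bytes and 0xFFFFFFFF for 32 bytes.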
    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}

static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.getScalarType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (!IsSEXT0 || !IsVZero1) {
      // Swap the operands and update the condition code.
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);

      IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
                (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
      IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
    }

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1, so the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }
  return SDValue();
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)
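  //
  // E.g. for (sint_to_fp (and (vector_cmp x, y), <1,1,1,1>)): each input lane
  // is 0 or 1, so each result lane is 0.0 or 1.0, and both bit patterns fall
  // out of ANDing the all-zeros/all-ones compare mask with the bits of 1.0f.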
  //
  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
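  // After the zero-extension the value is at most 2^16 - 1, well below 2^31,
  // so the i32 sign bit is clear and signed and unsigned conversion produce
  // the same result.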
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
      return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
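  // E.g. an i64 input with at least 64 - 31 = 33 sign bits lies in
  // [-2^31, 2^31), so converting its i32 truncation yields the same FP value.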
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
                             X86TargetLowering::DAGCombinerInfo &DCI) {
  // When legalizing carry, we create carries via add X, -1.
  // If that comes from an actual carry, via setcc, we use the
  // carry directly.
  if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
    SDValue Carry = N->getOperand(0);
    while (Carry.getOpcode() == ISD::TRUNCATE ||
           Carry.getOpcode() == ISD::ZERO_EXTEND ||
           Carry.getOpcode() == ISD::SIGN_EXTEND ||
           Carry.getOpcode() == ISD::ANY_EXTEND ||
           (Carry.getOpcode() == ISD::AND &&
            isOneConstant(Carry.getOperand(1))))
      Carry = Carry.getOperand(0);

    if (Carry.getOpcode() == X86ISD::SETCC ||
        Carry.getOpcode() == X86ISD::SETCC_CARRY) {
      if (Carry.getConstantOperandVal(0) == X86::COND_B)
        return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
    }
  }

  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // the flag result is unused.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones
/// mask, which is more useful than 0/1 in some cases.
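/// E.g. "sbb %eax, %eax" computes eax - eax - CF = -CF, i.e. 0 when the carry
/// is clear and 0xFFFFFFFF when it is set, so the carry flag becomes a full
/// mask in one instruction.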
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
  SDLoc DL(N);

  // "Condition code B" is also known as "the carry flag" (CF).
  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}

/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);

  // If this is an add, canonicalize a zext operand to the RHS.
  // TODO: Incomplete? What if both sides are zexts?
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);

  // Look through a one-use zext.
  bool PeekedThroughZext = false;
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
    Y = Y.getOperand(0);
    PeekedThroughZext = true;
  }

  // If this is an add, canonicalize a setcc operand to the RHS.
  // TODO: Incomplete? What if both sides are setcc?
  // TODO: Should we allow peeking through a zext of the other operand?
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  if (CC == X86::COND_B) {
    // X + SETB Z --> X + (mask SBB Z, Z)
    // X - SETB Z --> X - (mask SBB Z, Z)
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y->getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because the Cmp
    // instruction cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue Z = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
                               DAG.getConstant(1, DL, Z.getValueType()));

  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
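  // "cmp Z, 1" sets CF exactly when Z == 0 (unsigned Z < 1). E.g.
  // "sbb X, -1" then computes X - (-1) - CF = X + 1 - CF, which is X + 1 when
  // Z != 0 and X when Z == 0 -- exactly X + (Z != 0).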
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getConstant(-1ULL, DL, VT), NewCmp);

  // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), NewCmp);
}

static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  SDValue MulOp = N->getOperand(0);
  SDValue Phi = N->getOperand(1);

  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  EVT VT = N->getValueType(0);

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
  unsigned VectorSize = VT.getVectorNumElements() * 16;
  // If the vector size is less than 128, or greater than the supported
  // RegSize, do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                   VT.getVectorNumElements());
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                VT.getVectorNumElements() / 2);

  // Shrink the operands of the mul.
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

  // The madd vector size is half of the original vector size.
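  // VPMADDWD multiplies adjacent pairs of i16 elements and sums each pair
  // into one i32 lane, so 2*N i16 inputs become N i32 partial sums.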
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
  // Fill the rest of the output with 0.
  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}

static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we could only update
  // part of elements in the reduction vector.
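  // PSADBW sums the absolute differences of each group of 8 byte pairs into
  // one i64 lane, so a 128-bit input yields just two i64 results.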
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
    // Update part of elements of the reduction vector. This is done by first
    // extracting a sub-vector from it, updating this sub-vector, and inserting
    // it back.
    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
                                 DAG.getIntPtrConstant(0, DL));
    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
                       DAG.getIntPtrConstant(0, DL));
  }
  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}

static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
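    // This is two's complement: -Y == ~Y + 1, and with Y == A ^ ImmC we get
    // ~Y == A ^ ~ImmC, so X - Y folds to (A ^ ~ImmC) + (X + 1) with both
    // constants folded.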
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext x))) -> (vzext x)
  // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support.
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  MVT VT = RHS.getSimpleValueType();
  SDLoc DL(N);

  auto *C = dyn_cast<ConstantSDNode>(RHS);
  if (!C || C->getZExtValue() != 1)
    return SDValue();

  RHS = DAG.getConstant(-1, DL, VT);
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other),
                                 {Chain, LHS, RHS}, VT, MMO);
}

// TEST (AND a, b), (AND a, b) -> TEST a, b
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  return DAG.getNode(X86ISD::TESTM, DL, VT,
                     Op0->getOperand(0), Op0->getOperand(1));
}

static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}

static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  SDValue Idx = N->getOperand(2);

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT OpVT = N->getSimpleValueType(0);
  MVT SubVecVT = SubVec.getSimpleValueType();

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subvector operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
        }
      }
      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));
      }
    }
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
  case X86ISD::ADD:         return combineX86ADD(N, DAG, DCI);
  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT:          return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD:
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
  case X86ISD::TESTM:       return combineTestM(N, DAG);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}

/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}

/// This method queries the target whether it is beneficial for dag combiner to
/// promote the specified node. If true, it should return the desired promotion
/// type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
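// E.g. matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while matching
// "bswapl $0" against the piece "bswap" fails: after a piece is consumed,
// the next character must be whitespace, so a bare prefix is rejected.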
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
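// Illustrative example (not from the original source): a call like
//   asm("bswap $0" : "=r"(x) : "0"(x))
// on an i32 value is replaced with a call to llvm.bswap.i32, letting the
// backend choose the best byte-swap lowering instead of opaque inline asm.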
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
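// For reference (illustrative note): 'x' (any SSE register) classifies as
// C_RegisterClass, 'a' (the AX/EAX/RAX family) as C_Register, immediate-range
// letters such as 'I' as C_Other, and the two-letter "Yk" constraint as a
// register constraint for AVX512 mask registers.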
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
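// E.g. (illustrative): an "X"-constrained f32 operand becomes "Y" when SSE2
// is available and "x" with only SSE1; integer and other types fall through
// to the generic handling.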
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
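// For instance (illustrative, not from the original source), with the 'I'
// constraint (a 0..31 immediate, typical for 32-bit shift counts) a constant
// 31 is accepted and emitted as a target constant, while 32 hits the early
// return above, leaves Ops empty, and thereby rejects that alternative.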
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8: case MVT::v8i16: case MVT::v4i32:
      case MVT::v2i64: case MVT::v4f32: case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8: case MVT::v16i16: case MVT::v8i32:
      case MVT::v4i64: case MVT::v8f32: case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64: case MVT::v16f32: case MVT::v16i32: case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operations.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error;
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
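// Illustrative example (not from the original source): the constraint "{ax}"
// with an i32 operand first resolves to AX in a 16-bit class; the isGRClass
// fix-up above then rewrites it to EAX in GR32, so "{ax}",i32 behaves like
// {eax} instead of keeping a mismatched 16-bit register.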
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having less micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
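// For example (illustrative): an AddrMode describing (%rsi,%rdx,2) has
// AM.Scale == 2 and costs 1, a plain (%rsi) access has AM.Scale == 0 and
// costs 0, and an illegal addressing mode returns -1.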
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
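// E.g. (illustrative): under minsize, a scalar udiv by a constant keeps the
// compact div instruction, while a vector udiv still reports "not cheap" so
// it can be strength-reduced in vector form rather than scalarized into
// several divs.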
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
    Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}