contrib/llvm/lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86IntrinsicsInfo.h"
  21 #include "X86MachineFunctionInfo.h"
  22 #include "X86ShuffleDecodeConstantPool.h"
  23 #include "X86TargetMachine.h"
  24 #include "X86TargetObjectFile.h"
  25 #include "llvm/ADT/SmallBitVector.h"
  26 #include "llvm/ADT/SmallSet.h"
  27 #include "llvm/ADT/Statistic.h"
  28 #include "llvm/ADT/StringExtras.h"
  29 #include "llvm/ADT/StringSwitch.h"
  30 #include "llvm/Analysis/EHPersonalities.h"
  31 #include "llvm/CodeGen/IntrinsicLowering.h"
  32 #include "llvm/CodeGen/MachineFrameInfo.h"
  33 #include "llvm/CodeGen/MachineFunction.h"
  34 #include "llvm/CodeGen/MachineInstrBuilder.h"
  35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  36 #include "llvm/CodeGen/MachineModuleInfo.h"
  37 #include "llvm/CodeGen/MachineRegisterInfo.h"
  38 #include "llvm/CodeGen/WinEHFuncInfo.h"
  39 #include "llvm/IR/CallSite.h"
  40 #include "llvm/IR/CallingConv.h"
  41 #include "llvm/IR/Constants.h"
  42 #include "llvm/IR/DerivedTypes.h"
  43 #include "llvm/IR/DiagnosticInfo.h"
  44 #include "llvm/IR/Function.h"
  45 #include "llvm/IR/GlobalAlias.h"
  46 #include "llvm/IR/GlobalVariable.h"
  47 #include "llvm/IR/Instructions.h"
  48 #include "llvm/IR/Intrinsics.h"
  49 #include "llvm/MC/MCAsmInfo.h"
  50 #include "llvm/MC/MCContext.h"
  51 #include "llvm/MC/MCExpr.h"
  52 #include "llvm/MC/MCSymbol.h"
  53 #include "llvm/Support/CommandLine.h"
  54 #include "llvm/Support/Debug.h"
  55 #include "llvm/Support/ErrorHandling.h"
  56 #include "llvm/Support/KnownBits.h"
  57 #include "llvm/Support/MathExtras.h"
  58 #include "llvm/Target/TargetLowering.h"
  59 #include "llvm/Target/TargetOptions.h"
  60 #include <algorithm>
  61 #include <bitset>
  62 #include <cctype>
  63 #include <numeric>
  64 using namespace llvm;
  65
  66 #define DEBUG_TYPE "x86-isel"
  67 // STATISTIC entry NumTailCalls, labelled "Number of tail calls". NOTE(review): presumably grouped under DEBUG_TYPE above -- confirm.
  68 STATISTIC(NumTailCalls, "Number of tail calls");
  69
  70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  71     "x86-experimental-vector-widening-legalization", cl::Hidden,
  72     cl::desc("Enable an experimental vector type legalization "
  73              "through widening rather than promotion."),
  74     cl::init(false)); // Starts out false.
  75
  76 static cl::opt<int> ExperimentalPrefLoopAlignment(
  77     "x86-experimental-pref-loop-alignment", cl::Hidden,
  78     cl::desc("Sets the preferable loop alignment for experiments (the "
  79              "last x86-experimental-pref-loop-alignment bits of the "
  80              "loop header PC will be 0)."),
  81     cl::init(4)); // Starts out at 4.
  82
  83 static cl::opt<bool> MulConstantOptimization(
  84     "mul-constant-optimization", cl::Hidden,
  85     cl::desc("Replace 'mul x, Const' with more effective "
  86              "instructions like SHIFT, LEA, etc."),
  87     cl::init(true)); // Starts out true.
  88
  89 /// Emit a DiagnosticInfoUnsupported carrying \p Msg for the function being
  90 /// compiled, located at \p dl's debug location. Intended for requests the
  91 /// target cannot honour (e.g. returning a double without SSE2 on x86_64);
  92 /// unlike report_fatal_error it is not fatal, so callers should recover.
  93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
  94                              const char *Msg) {
  95   const auto &Fn = *DAG.getMachineFunction().getFunction();
  96   DAG.getContext()->diagnose(
  97       DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
  98 }
  99
 100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 101                                      const X86Subtarget &STI)
 102     : TargetLowering(TM), Subtarget(STI) {
 103   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
 104   X86ScalarSSEf64 = Subtarget.hasSSE2();
 105   X86ScalarSSEf32 = Subtarget.hasSSE1();
 106   MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
 107
 108   // Set up the TargetLowering object.
 109
 110   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 111   setBooleanContents(ZeroOrOneBooleanContent);
 112   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 113   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 114
 115   // For 64-bit, since we have so many registers, use the ILP scheduler.
 116   // For 32-bit, use the register pressure specific scheduling.
 117   // For Atom, always use ILP scheduling.
 118   if (Subtarget.isAtom())
 119     setSchedulingPreference(Sched::ILP);
 120   else if (Subtarget.is64Bit())
 121     setSchedulingPreference(Sched::ILP);
 122   else
 123     setSchedulingPreference(Sched::RegPressure);
 124   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 125   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 126
 127   // Bypass expensive divides and use cheaper ones.
 128   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 129     if (Subtarget.hasSlowDivide32())
 130       addBypassSlowDiv(32, 8);
 131     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
 132       addBypassSlowDiv(64, 32);
 133   }
 134
 135   if (Subtarget.isTargetKnownWindowsMSVC() ||
 136       Subtarget.isTargetWindowsItanium()) {
 137     // Setup Windows compiler runtime calls.
 138     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 139     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 140     setLibcallName(RTLIB::SREM_I64, "_allrem");
 141     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 142     setLibcallName(RTLIB::MUL_I64, "_allmul");
 143     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 144     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 145     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 146     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 147     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 148   }
 149
 150   if (Subtarget.isTargetDarwin()) {
 151     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 152     setUseUnderscoreSetJmp(false);
 153     setUseUnderscoreLongJmp(false);
 154   } else if (Subtarget.isTargetWindowsGNU()) {
 155     // MS runtime is weird: it exports _setjmp, but longjmp!
 156     setUseUnderscoreSetJmp(true);
 157     setUseUnderscoreLongJmp(false);
 158   } else {
 159     setUseUnderscoreSetJmp(true);
 160     setUseUnderscoreLongJmp(true);
 161   }
 162
 163   // Set up the register classes.
 164   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 165   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 166   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 167   if (Subtarget.is64Bit())
 168     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 169
 170   for (MVT VT : MVT::integer_valuetypes())
 171     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 172
 173   // We don't accept any truncstore of integer registers.
 174   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 175   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 176   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 177   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 178   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 179   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 180
 181   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 182
 183   // SETOEQ and SETUNE require checking two conditions.
 184   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 185   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 186   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 187   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 188   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 189   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 190
 191   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 192   // operation.
 193   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 194   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 195   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 196
 197   if (Subtarget.is64Bit()) {
 198     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
 199       // f32/f64 are legal, f80 is custom.
 200       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
 201     else
 202       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
 203     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 204   } else if (!Subtarget.useSoftFloat()) {
 205     // We have an algorithm for SSE2->double, and we turn this into a
 206     // 64-bit FILD followed by conditional FADD for other targets.
 207     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 208     // We have an algorithm for SSE2, and we turn this into a 64-bit
 209     // FILD or VCVTUSI2SS/SD for other targets.
 210     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 211   }
 212
 213   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 214   // this operation.
 215   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 216   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 217
 218   if (!Subtarget.useSoftFloat()) {
 219     // SSE has no i16 to fp conversion, only i32.
 220     if (X86ScalarSSEf32) {
 221       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 222       // f32 and f64 cases are Legal, f80 case is not
 223       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 224     } else {
 225       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 226       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 227     }
 228   } else {
 229     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 230     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 231   }
 232
 233   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 234   // this operation.
 235   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 236   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 237
 238   if (!Subtarget.useSoftFloat()) {
 239     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 240     // are Legal, f80 is custom lowered.
 241     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 242     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 243
 244     if (X86ScalarSSEf32) {
 245       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 246       // f32 and f64 cases are Legal, f80 case is not
 247       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 248     } else {
 249       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 250       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 251     }
 252   } else {
 253     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 254     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
 255     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
 256   }
 257
 258   // Handle FP_TO_UINT by promoting the destination to a larger signed
 259   // conversion.
 260   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 261   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 262   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 263
 264   if (Subtarget.is64Bit()) {
 265     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
 266       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
 267       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 268       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
 269     } else {
 270       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
 271       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
 272     }
 273   } else if (!Subtarget.useSoftFloat()) {
 274     // Since AVX is a superset of SSE3, only check for SSE here.
 275     if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
 276       // Expand FP_TO_UINT into a select.
 277       // FIXME: We would like to use a Custom expander here eventually to do
 278       // the optimal thing for SSE vs. the default expansion in the legalizer.
 279       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 280     else
 281       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
 282       // With SSE3 we can use fisttpll to convert to a signed i64; without
 283       // SSE, we're stuck with a fistpll.
 284       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 285
 286     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 287   }
 288
 289   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 290   if (!X86ScalarSSEf64) {
 291     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 292     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 293     if (Subtarget.is64Bit()) {
 294       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 295       // Without SSE, i64->f64 goes through memory.
 296       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 297     }
 298   } else if (!Subtarget.is64Bit())
 299     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
 300
 301   // Scalar integer divide and remainder are lowered to use operations that
 302   // produce two results, to match the available instructions. This exposes
 303   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 304   // into a single instruction.
 305   //
 306   // Scalar integer multiply-high is also lowered to use two-result
 307   // operations, to match the available instructions. However, plain multiply
 308   // (low) operations are left as Legal, as there are single-result
 309   // instructions for this in x86. Using the two-result multiply instructions
 310   // when both high and low results are needed must be arranged by dagcombine.
 311   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 312     setOperationAction(ISD::MULHS, VT, Expand);
 313     setOperationAction(ISD::MULHU, VT, Expand);
 314     setOperationAction(ISD::SDIV, VT, Expand);
 315     setOperationAction(ISD::UDIV, VT, Expand);
 316     setOperationAction(ISD::SREM, VT, Expand);
 317     setOperationAction(ISD::UREM, VT, Expand);
 318   }
 319
 320   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 321   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 322   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
 323                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
 324     setOperationAction(ISD::BR_CC,     VT, Expand);
 325     setOperationAction(ISD::SELECT_CC, VT, Expand);
 326   }
 327   if (Subtarget.is64Bit())
 328     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 329   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 330   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 331   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 332   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 333
 334   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 335   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 336   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 337   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 338
 339   // Promote the i8 variants and force them on up to i32 which has a shorter
 340   // encoding.
 341   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
 342   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 343   if (!Subtarget.hasBMI()) {
 344     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 345     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 346     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
 347     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
 348     if (Subtarget.is64Bit()) {
 349       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 350       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
 351     }
 352   }
 353
 354   if (Subtarget.hasLZCNT()) {
 355     // When promoting the i8 variants, force them to i32 for a shorter
 356     // encoding.
 357     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
 358     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 359   } else {
 360     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 361     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 362     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 363     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 364     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 365     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 366     if (Subtarget.is64Bit()) {
 367       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 368       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 369     }
 370   }
 371
 372   // Special handling for half-precision floating point conversions.
 373   // If we don't have F16C support, then lower half float conversions
 374   // into library calls.
 375   if (Subtarget.useSoftFloat() ||
 376       (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
 377     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 378     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 379   }
 380
 381   // There's never any support for operations beyond MVT::f32.
 382   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 383   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 384   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 385   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 386
 387   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 388   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 389   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 390   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 391   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 392   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 393
 394   if (Subtarget.hasPOPCNT()) {
 395     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 396   } else {
 397     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 398     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 399     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 400     if (Subtarget.is64Bit())
 401       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 402   }
 403
 404   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 405
 406   if (!Subtarget.hasMOVBE())
 407     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 408
 409   // These should be promoted to a larger select which is supported.
 410   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 411   // X86 wants to expand cmov itself.
 412   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
 413     setOperationAction(ISD::SELECT, VT, Custom);
 414     setOperationAction(ISD::SETCC, VT, Custom);
 415   }
 416   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 417     if (VT == MVT::i64 && !Subtarget.is64Bit())
 418       continue;
 419     setOperationAction(ISD::SELECT, VT, Custom);
 420     setOperationAction(ISD::SETCC,  VT, Custom);
 421   }
 422
 423   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
 424   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
 425   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
 426
 427   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 428   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 429   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 430   // support continuation, user-level threading, and etc.. As a result, no
 431   // other SjLj exception interfaces are implemented and please don't build
 432   // your own exception handling based on them.
 433   // LLVM/Clang supports zero-cost DWARF exception handling.
 434   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 435   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 436   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 437   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
 438     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 439
 440   // Darwin ABI issue.
 441   for (auto VT : { MVT::i32, MVT::i64 }) {
 442     if (VT == MVT::i64 && !Subtarget.is64Bit())
 443       continue;
 444     setOperationAction(ISD::ConstantPool    , VT, Custom);
 445     setOperationAction(ISD::JumpTable       , VT, Custom);
 446     setOperationAction(ISD::GlobalAddress   , VT, Custom);
 447     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
 448     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
 449     setOperationAction(ISD::BlockAddress    , VT, Custom);
 450   }
 451
 452   // 64-bit shl, sra, srl (iff 32-bit x86)
 453   for (auto VT : { MVT::i32, MVT::i64 }) {
 454     if (VT == MVT::i64 && !Subtarget.is64Bit())
 455       continue;
 456     setOperationAction(ISD::SHL_PARTS, VT, Custom);
 457     setOperationAction(ISD::SRA_PARTS, VT, Custom);
 458     setOperationAction(ISD::SRL_PARTS, VT, Custom);
 459   }
 460
 461   if (Subtarget.hasSSE1())
 462     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 463
 464   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 465
 466   // Expand certain atomics
 467   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 468     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 469     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 470     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
 471     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
 472     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
 473     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
 474     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 475   }
 476
 477   if (Subtarget.hasCmpxchg16b()) {
 478     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 479   }
 480
 481   // FIXME - use subtarget debug flags
 482   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
 483       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
 484       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
 485     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 486   }
 487
 488   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 489   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 490
 491   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 492   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 493
 494   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 495   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 496
 497   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 498   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 499   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 500   bool Is64Bit = Subtarget.is64Bit();
 501   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
 502   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
 503
 504   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 505   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 506
 507   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
 508
 509   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
 510   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
 511   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 512
 513   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
 514     // f32 and f64 use SSE.
 515     // Set up the FP register classes.
 516     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
 517                                                      : &X86::FR32RegClass);
 518     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
 519                                                      : &X86::FR64RegClass);
 520
 521     for (auto VT : { MVT::f32, MVT::f64 }) {
 522       // Use ANDPD to simulate FABS.
 523       setOperationAction(ISD::FABS, VT, Custom);
 524
 525       // Use XORP to simulate FNEG.
 526       setOperationAction(ISD::FNEG, VT, Custom);
 527
 528       // Use ANDPD and ORPD to simulate FCOPYSIGN.
 529       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 530
 531       // We don't support sin/cos/fmod
 532       setOperationAction(ISD::FSIN   , VT, Expand);
 533       setOperationAction(ISD::FCOS   , VT, Expand);
 534       setOperationAction(ISD::FSINCOS, VT, Expand);
 535     }
 536
 537     // Lower this to MOVMSK plus an AND.
 538     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 539     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 540
 541     // Expand FP immediates into loads from the stack, except for the special
 542     // cases we handle.
 543     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 544     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 545   } else if (UseX87 && X86ScalarSSEf32) {
 546     // Use SSE for f32, x87 for f64.
 547     // Set up the FP register classes.
 548     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
 549                                                      : &X86::FR32RegClass);
 550     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 551
 552     // Use ANDPS to simulate FABS.
 553     setOperationAction(ISD::FABS , MVT::f32, Custom);
 554
 555     // Use XORP to simulate FNEG.
 556     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 557
 558     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 559
 560     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 561     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 562     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 563
 564     // We don't support sin/cos/fmod
 565     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 566     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 567     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 568
 569     // Special cases we handle for FP constants.
 570     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 571     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 572     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 573     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 574     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 575
 576     if (!TM.Options.UnsafeFPMath) {
 577       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 578       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 579       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 580     }
 581   } else if (UseX87) {
 582     // f32 and f64 in x87.
 583     // Set up the FP register classes.
 584     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 585     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 586
 587     for (auto VT : { MVT::f32, MVT::f64 }) {
 588       setOperationAction(ISD::UNDEF,     VT, Expand);
 589       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 590
 591       if (!TM.Options.UnsafeFPMath) {
 592         setOperationAction(ISD::FSIN   , VT, Expand);
 593         setOperationAction(ISD::FCOS   , VT, Expand);
 594         setOperationAction(ISD::FSINCOS, VT, Expand);
 595       }
 596     }
 597     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 598     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 599     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 600     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 601     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 602     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 603     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 604     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 605   }
 606
 607   // We don't support FMA.
 608   setOperationAction(ISD::FMA, MVT::f64, Expand);
 609   setOperationAction(ISD::FMA, MVT::f32, Expand);
 610
 611   // Long double always uses X87, except f128 in MMX.
 612   if (UseX87) {
 613     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
 614       addRegisterClass(MVT::f128, &X86::FR128RegClass);
 615       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
 616       setOperationAction(ISD::FABS , MVT::f128, Custom);
 617       setOperationAction(ISD::FNEG , MVT::f128, Custom);
 618       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
 619     }
 620
 621     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 622     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 623     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 624     {
 625       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
 626       addLegalFPImmediate(TmpFlt);  // FLD0
 627       TmpFlt.changeSign();
 628       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 629
 630       bool ignored;
 631       APFloat TmpFlt2(+1.0);
 632       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
 633                       &ignored);
 634       addLegalFPImmediate(TmpFlt2);  // FLD1
 635       TmpFlt2.changeSign();
 636       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 637     }
 638
 639     if (!TM.Options.UnsafeFPMath) {
 640       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 641       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 642       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 643     }
 644
 645     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 646     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 647     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 648     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 649     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 650     setOperationAction(ISD::FMA, MVT::f80, Expand);
 651   }
 652
 653   // Always use a library call for pow.
 654   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 655   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 656   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 657
 658   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 659   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 660   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 661   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 662   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 663   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 664   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 665
 666   // Some FP actions are always expanded for vector types.
 667   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
 668                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
 669     setOperationAction(ISD::FSIN,      VT, Expand);
 670     setOperationAction(ISD::FSINCOS,   VT, Expand);
 671     setOperationAction(ISD::FCOS,      VT, Expand);
 672     setOperationAction(ISD::FREM,      VT, Expand);
 673     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 674     setOperationAction(ISD::FPOW,      VT, Expand);
 675     setOperationAction(ISD::FLOG,      VT, Expand);
 676     setOperationAction(ISD::FLOG2,     VT, Expand);
 677     setOperationAction(ISD::FLOG10,    VT, Expand);
 678     setOperationAction(ISD::FEXP,      VT, Expand);
 679     setOperationAction(ISD::FEXP2,     VT, Expand);
 680   }
 681
 682   // First set operation action for all vector types to either promote
 683   // (for widening) or expand (for scalarization). Then we will selectively
 684   // turn on ones that can be effectively codegen'd.
 685   for (MVT VT : MVT::vector_valuetypes()) {
 686     setOperationAction(ISD::SDIV, VT, Expand);
 687     setOperationAction(ISD::UDIV, VT, Expand);
 688     setOperationAction(ISD::SREM, VT, Expand);
 689     setOperationAction(ISD::UREM, VT, Expand);
 690     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 691     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 692     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 693     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 694     setOperationAction(ISD::FMA,  VT, Expand);
 695     setOperationAction(ISD::FFLOOR, VT, Expand);
 696     setOperationAction(ISD::FCEIL, VT, Expand);
 697     setOperationAction(ISD::FTRUNC, VT, Expand);
 698     setOperationAction(ISD::FRINT, VT, Expand);
 699     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 700     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 701     setOperationAction(ISD::MULHS, VT, Expand);
 702     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 703     setOperationAction(ISD::MULHU, VT, Expand);
 704     setOperationAction(ISD::SDIVREM, VT, Expand);
 705     setOperationAction(ISD::UDIVREM, VT, Expand);
 706     setOperationAction(ISD::CTPOP, VT, Expand);
 707     setOperationAction(ISD::CTTZ, VT, Expand);
 708     setOperationAction(ISD::CTLZ, VT, Expand);
 709     setOperationAction(ISD::ROTL, VT, Expand);
 710     setOperationAction(ISD::ROTR, VT, Expand);
 711     setOperationAction(ISD::BSWAP, VT, Expand);
 712     setOperationAction(ISD::SETCC, VT, Expand);
 713     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 714     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 715     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 716     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 717     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 718     setOperationAction(ISD::TRUNCATE, VT, Expand);
 719     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 720     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 721     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 722     setOperationAction(ISD::SELECT_CC, VT, Expand);
 723     for (MVT InnerVT : MVT::vector_valuetypes()) {
 724       setTruncStoreAction(InnerVT, VT, Expand);
 725
 726       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 727       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 728
 729       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 730       // types, we have to deal with them whether we ask for Expansion or not.
 731       // Setting Expand causes its own optimisation problems though, so leave
 732       // them legal.
 733       if (VT.getVectorElementType() == MVT::i1)
 734         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 735
 736       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
 737       // split/scalarized right now.
 738       if (VT.getVectorElementType() == MVT::f16)
 739         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 740     }
 741   }
 742
 743   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 744   // with -msoft-float, disable use of MMX as well.
 745   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
 746     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 747     // No operations on x86mmx supported, everything uses intrinsics.
 748   }
 749
 750   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
 751     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
 752                                                     : &X86::VR128RegClass);
 753
 754     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 755     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 756     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
 757     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 758     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 759     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
 760     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 761     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 762     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 763   }
 764
 765   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
 766     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
 767                                                     : &X86::VR128RegClass);
 768
 769     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 770     // registers cannot be used even for integer operations.
 771     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
 772                                                     : &X86::VR128RegClass);
 773     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
 774                                                     : &X86::VR128RegClass);
 775     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
 776                                                     : &X86::VR128RegClass);
 777     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
 778                                                     : &X86::VR128RegClass);
 779
 780     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
 781     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 782     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 783     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 784     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 785     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
 786     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
 787     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 788     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 789     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 790     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 791     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 792     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
 793
 794     setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
 795     setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
 796     setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
 797     setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
 798
 799     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 800     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 801     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 802
 803     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
 804       setOperationAction(ISD::SETCC,              VT, Custom);
 805       setOperationAction(ISD::CTPOP,              VT, Custom);
 806       setOperationAction(ISD::CTTZ,               VT, Custom);
 807     }
 808
 809     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
 810       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
 811       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 812       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 813       setOperationAction(ISD::VSELECT,            VT, Custom);
 814       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 815     }
 816
 817     // We support custom legalizing of sext and anyext loads for specific
 818     // memory vector types which we can load as a scalar (or sequence of
 819     // scalars) and extend in-register to a legal 128-bit vector type. For sext
 820     // loads these must work with a single scalar load.
 821     for (MVT VT : MVT::integer_vector_valuetypes()) {
 822       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
 823       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
 824       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
 825       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
 826       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
 827       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
 828       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
 829       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
 830       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
 831     }
 832
 833     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
 834       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 835       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 836       setOperationAction(ISD::VSELECT,            VT, Custom);
 837
 838       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
 839         continue;
 840
 841       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
 842       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 843     }
 844
 845     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
 846     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
 847       setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
 848       setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
 849       setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
 850       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
 851       setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
 852     }
 853
 854     // Custom lower v2i64 and v2f64 selects.
 855     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
 856     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
 857
 858     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
 859     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
 860
 861     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 862     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
 863
 864     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
 865     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
 866     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
 867
 868     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
 869     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
 870
 871     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
 872     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 873
 874     for (MVT VT : MVT::fp_vector_valuetypes())
 875       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 876
 877     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
 878     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
 879     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
 880
 881     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
 882     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
 883     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
 884
 885     // In the customized shift lowering, the legal v4i32/v2i64 cases
 886     // in AVX2 will be recognized.
 887     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
 888       setOperationAction(ISD::SRL,              VT, Custom);
 889       setOperationAction(ISD::SHL,              VT, Custom);
 890       setOperationAction(ISD::SRA,              VT, Custom);
 891     }
 892   }
 893
 894   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
 895     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
 896     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
 897     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
 898     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
 899     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
 900     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
 901     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
 902     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
 903   }
 904
 905   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
 906     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
 907       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
 908       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
 909       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
 910       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
 911       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
 912     }
 913
 914     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
 915     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
 916     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
 917     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
 918     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
 919     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
 920     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
 921     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
 922
 923     // FIXME: Do we need to handle scalar-to-vector here?
 924     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 925
 926     // We directly match byte blends in the backend as they match the VSELECT
 927     // condition form.
 928     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
 929
 930     // SSE41 brings specific instructions for doing vector sign extend even in
 931     // cases where we don't have SRA.
 932     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
 933       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
 934       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
 935     }
 936
 937     for (MVT VT : MVT::integer_vector_valuetypes()) {
 938       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
 939       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
 940       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
 941     }
 942
 943     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
 944     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
 945       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
 946       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
 947       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
 948       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
 949       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
 950       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
 951     }
 952
 953     // i8 vectors are custom because the source register and source
 954     // memory operand types are not the same width.
 955     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
 956   }
 957
 958   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
 959     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
 960                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
 961       setOperationAction(ISD::ROTL, VT, Custom);
 962
 963     // XOP can efficiently perform BITREVERSE with VPPERM.
 964     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
 965       setOperationAction(ISD::BITREVERSE, VT, Custom);
 966
 967     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
 968                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
 969       setOperationAction(ISD::BITREVERSE, VT, Custom);
 970   }
 971
 972   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
 973     bool HasInt256 = Subtarget.hasInt256();
 974
 975     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 976                                                      : &X86::VR256RegClass);
 977     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
 978                                                      : &X86::VR256RegClass);
 979     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 980                                                      : &X86::VR256RegClass);
 981     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 982                                                      : &X86::VR256RegClass);
 983     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 984                                                      : &X86::VR256RegClass);
 985     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 986                                                      : &X86::VR256RegClass);
 987
 988     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
 989       setOperationAction(ISD::FFLOOR,     VT, Legal);
 990       setOperationAction(ISD::FCEIL,      VT, Legal);
 991       setOperationAction(ISD::FTRUNC,     VT, Legal);
 992       setOperationAction(ISD::FRINT,      VT, Legal);
 993       setOperationAction(ISD::FNEARBYINT, VT, Legal);
 994       setOperationAction(ISD::FNEG,       VT, Custom);
 995       setOperationAction(ISD::FABS,       VT, Custom);
 996       setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
 997     }
 998
 999     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1000     // even though v8i16 is a legal type.
1001     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1002     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1003     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1004
1005     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1006     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1007     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1008
1009     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1010     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1011
1012     for (MVT VT : MVT::fp_vector_valuetypes())
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1014
1015     // In the customized shift lowering, the legal v8i32/v4i64 cases
1016     // in AVX2 will be recognized.
1017     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1018       setOperationAction(ISD::SRL, VT, Custom);
1019       setOperationAction(ISD::SHL, VT, Custom);
1020       setOperationAction(ISD::SRA, VT, Custom);
1021     }
1022
1023     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1024     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1025     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1026
1027     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1028       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1029       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1030       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1031     }
1032
1033     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1034     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1035     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1036     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1037
1038     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1039       setOperationAction(ISD::SETCC,           VT, Custom);
1040       setOperationAction(ISD::CTPOP,           VT, Custom);
1041       setOperationAction(ISD::CTTZ,            VT, Custom);
1042       setOperationAction(ISD::CTLZ,            VT, Custom);
1043     }
1044
1045     if (Subtarget.hasAnyFMA()) {
1046       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1047                        MVT::v2f64, MVT::v4f64 })
1048         setOperationAction(ISD::FMA, VT, Legal);
1049     }
1050
1051     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1052       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1053       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1054     }
1055
1056     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1057     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1058     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1059     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1060
1061     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
1062     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
1063
1064     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1065     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1066     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1067     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1068
1069     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1070       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1071       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1072       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1073       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1074       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1075     }
1076
1077     if (HasInt256) {
1078       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
1079       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
1080       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1081
1082       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1083       // when we have a 256bit-wide blend with immediate.
1084       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1085
1086       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1087       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1088         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1089         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1090         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1091         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1092         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1093         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1094       }
1095     }
1096
1097     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1098                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1099       setOperationAction(ISD::MLOAD,  VT, Legal);
1100       setOperationAction(ISD::MSTORE, VT, Legal);
1101     }
1102
1103     // Extract subvector is special because the value type
1104     // (result) is 128-bit but the source is 256-bit wide.
1105     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1106                      MVT::v4f32, MVT::v2f64 }) {
1107       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1108     }
1109
1110     // Custom lower several nodes for 256-bit types.
1111     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1112                     MVT::v8f32, MVT::v4f64 }) {
1113       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1114       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1115       setOperationAction(ISD::VSELECT,            VT, Custom);
1116       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1117       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1118       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1119       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1120       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1121     }
1122
1123     if (HasInt256)
1124       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1125
1126     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
1127     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1128       setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
1129       setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
1130       setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
1131       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
1132       setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1133     }
1134   }
1135
1136   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1137     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1138     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1139     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1140     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1141
1142     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1143     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1144     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1145
1146     for (MVT VT : MVT::fp_vector_valuetypes())
1147       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1148
1149     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1150       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1151       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1152       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
1153       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1154       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1155       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1156     }
1157
1158     for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1159                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1160                    MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1161       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1162       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1163       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1164       setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
1165       setTruncStoreAction(VT, MaskVT, Custom);
1166     }
1167
1168     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1169       setOperationAction(ISD::FNEG,  VT, Custom);
1170       setOperationAction(ISD::FABS,  VT, Custom);
1171       setOperationAction(ISD::FMA,   VT, Legal);
1172       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1173     }
1174
1175     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1176     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1178     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1179     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1182     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1183     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1184     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1186     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1187     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1188     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1189     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1190     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1, Custom);
1191     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i1, Custom);
1192     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,  Custom);
1193     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i1,  Custom);
1194     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i1,  Custom);
1195     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i1,  Custom);
1196     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i1,  Custom);
1197     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i1,  Custom);
1198     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1199     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1200
1201     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1202     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1203     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1204     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1205     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1206     if (Subtarget.hasVLX()){
1207       setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1208       setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1209       setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1210       setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1211       setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1212
1213       setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1214       setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1215       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1216       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1217       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1218     } else {
1219       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1220            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1221         setOperationAction(ISD::MLOAD,  VT, Custom);
1222         setOperationAction(ISD::MSTORE, VT, Custom);
1223       }
1224     }
1225     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1226     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1227
1228     if (Subtarget.hasDQI()) {
1229       for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1230         setOperationAction(ISD::SINT_TO_FP,     VT, Legal);
1231         setOperationAction(ISD::UINT_TO_FP,     VT, Legal);
1232         setOperationAction(ISD::FP_TO_SINT,     VT, Legal);
1233         setOperationAction(ISD::FP_TO_UINT,     VT, Legal);
1234       }
1235       if (Subtarget.hasVLX()) {
1236         // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1237         setOperationAction(ISD::SINT_TO_FP,    MVT::v2f32, Custom);
1238         setOperationAction(ISD::FP_TO_SINT,    MVT::v2f32, Custom);
1239         setOperationAction(ISD::FP_TO_UINT,    MVT::v2f32, Custom);
1240       }
1241     }
1242     if (Subtarget.hasVLX()) {
1243       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
1244       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
1245       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
1246       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
1247       setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
1248       setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
1249       setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
1250       setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
1251       setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
1252       setOperationAction(ISD::SIGN_EXTEND,      MVT::v4i32, Custom);
1253       setOperationAction(ISD::SIGN_EXTEND,      MVT::v2i64, Custom);
1254
1255       // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
1256       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
1257       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1258       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1259       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1260       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
1261       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1262       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1263       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1264       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1265       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1266     }
1267
1268     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1269     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1270     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1271     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1272     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1273     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1274     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1275     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1276     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1277     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1278
1279     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1280       setOperationAction(ISD::FFLOOR,           VT, Legal);
1281       setOperationAction(ISD::FCEIL,            VT, Legal);
1282       setOperationAction(ISD::FTRUNC,           VT, Legal);
1283       setOperationAction(ISD::FRINT,            VT, Legal);
1284       setOperationAction(ISD::FNEARBYINT,       VT, Legal);
1285     }
1286
1287     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
1288     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1289
1290     // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1291     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1292     setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1293
1294     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1295     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1296     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1297     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1298     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
1299
1300     setOperationAction(ISD::MUL,                MVT::v8i64, Custom);
1301
1302     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1303     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
1304     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1305     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1306     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1307     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1308
1309     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1310
1311     // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1312     setOperationAction(ISD::ABS,                MVT::v4i64, Legal);
1313     setOperationAction(ISD::ABS,                MVT::v2i64, Legal);
1314
1315     for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1316       setOperationAction(ISD::ADD,              VT, Custom);
1317       setOperationAction(ISD::SUB,              VT, Custom);
1318       setOperationAction(ISD::MUL,              VT, Custom);
1319       setOperationAction(ISD::SETCC,            VT, Custom);
1320       setOperationAction(ISD::SELECT,           VT, Custom);
1321       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1322
1323       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1324       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1325       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1326       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1327       setOperationAction(ISD::VSELECT,          VT,  Expand);
1328     }
1329
1330     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331       setOperationAction(ISD::SMAX,             VT, Legal);
1332       setOperationAction(ISD::UMAX,             VT, Legal);
1333       setOperationAction(ISD::SMIN,             VT, Legal);
1334       setOperationAction(ISD::UMIN,             VT, Legal);
1335       setOperationAction(ISD::ABS,              VT, Legal);
1336       setOperationAction(ISD::SRL,              VT, Custom);
1337       setOperationAction(ISD::SHL,              VT, Custom);
1338       setOperationAction(ISD::SRA,              VT, Custom);
1339       setOperationAction(ISD::CTPOP,            VT, Custom);
1340       setOperationAction(ISD::CTTZ,             VT, Custom);
1341     }
1342
1343     // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1344     for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1345                     MVT::v8i64}) {
1346       setOperationAction(ISD::ROTL,             VT, Custom);
1347       setOperationAction(ISD::ROTR,             VT, Custom);
1348     }
1349
1350     // Need to promote to 64-bit even though we have 32-bit masked instructions
1351     // because the IR optimizers rearrange bitcasts around logic ops leaving
1352     // too many variations to handle if we don't promote them.
1353     setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1354     setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
1355     setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1356
1357     if (Subtarget.hasCDI()) {
1358       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1359       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1360                       MVT::v4i64, MVT::v8i64}) {
1361         setOperationAction(ISD::CTLZ,            VT, Legal);
1362         setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1363       }
1364     } // Subtarget.hasCDI()
1365
1366     if (Subtarget.hasDQI()) {
1367       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1368       setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1369       setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1370       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1371     }
1372
1373     if (Subtarget.hasVPOPCNTDQ()) {
1374       // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1375       // version of popcntd/q.
1376       for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1377                       MVT::v4i32, MVT::v2i64})
1378         setOperationAction(ISD::CTPOP, VT, Legal);
1379     }
1380
1381     // Custom lower several nodes.
1382     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1383                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1384       setOperationAction(ISD::MGATHER,  VT, Custom);
1385       setOperationAction(ISD::MSCATTER, VT, Custom);
1386     }
1387     // Extract subvector is special because the value type
1388     // (result) is 256-bit but the source is 512-bit wide.
1389     // 128-bit was made Custom under AVX1.
1390     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1391                      MVT::v8f32, MVT::v4f64, MVT::v1i1 })
1392       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1393     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1394                      MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1395       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1396
1397     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1398       setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1399       setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1400       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1401       setOperationAction(ISD::VSELECT,             VT, Custom);
1402       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1403       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1404       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Legal);
1405       setOperationAction(ISD::MLOAD,               VT, Legal);
1406       setOperationAction(ISD::MSTORE,              VT, Legal);
1407       setOperationAction(ISD::MGATHER,             VT, Legal);
1408       setOperationAction(ISD::MSCATTER,            VT, Custom);
1409     }
1410     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1411       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
1412       setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1413     }
1414   }// has  AVX-512
1415
1416   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1417     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1418     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1419
1420     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1421     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1422
1423     setOperationAction(ISD::ADD,                MVT::v32i1, Custom);
1424     setOperationAction(ISD::ADD,                MVT::v64i1, Custom);
1425     setOperationAction(ISD::SUB,                MVT::v32i1, Custom);
1426     setOperationAction(ISD::SUB,                MVT::v64i1, Custom);
1427     setOperationAction(ISD::MUL,                MVT::v32i1, Custom);
1428     setOperationAction(ISD::MUL,                MVT::v64i1, Custom);
1429
1430     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1431     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1432     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1433     setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
1434     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
1435     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
1436     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1437     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1438     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
1439     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
1440     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1441     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1442     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Legal);
1443     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Legal);
1444     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1445     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1446     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1,  Custom);
1447     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1448     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
1449     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
1450     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1451     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1452     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1453     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1454     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1455     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1456     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
1457     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
1458     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
1459     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1460     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1461     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1462     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1463     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
1464     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
1468     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
1469     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
1470     setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
1471     setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
1472     setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
1473     setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
1474     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
1475
1476     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1477
1478     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1479     if (Subtarget.hasVLX()) {
1480       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1481       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1482     }
1483
1484     LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1485     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1486       setOperationAction(ISD::MLOAD,               VT, Action);
1487       setOperationAction(ISD::MSTORE,              VT, Action);
1488     }
1489
1490     if (Subtarget.hasCDI()) {
1491       setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
1492       setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
1493     }
1494
1495     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1496       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1497       setOperationAction(ISD::VSELECT,      VT, Custom);
1498       setOperationAction(ISD::ABS,          VT, Legal);
1499       setOperationAction(ISD::SRL,          VT, Custom);
1500       setOperationAction(ISD::SHL,          VT, Custom);
1501       setOperationAction(ISD::SRA,          VT, Custom);
1502       setOperationAction(ISD::MLOAD,        VT, Legal);
1503       setOperationAction(ISD::MSTORE,       VT, Legal);
1504       setOperationAction(ISD::CTPOP,        VT, Custom);
1505       setOperationAction(ISD::CTTZ,         VT, Custom);
1506       setOperationAction(ISD::SMAX,         VT, Legal);
1507       setOperationAction(ISD::UMAX,         VT, Legal);
1508       setOperationAction(ISD::SMIN,         VT, Legal);
1509       setOperationAction(ISD::UMIN,         VT, Legal);
1510
1511       setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
1512       setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
1513       setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
1514     }
1515
1516     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1517       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1518       if (Subtarget.hasVLX()) {
1519         // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
1520         setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1521         setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
1522       }
1523     }
1524   }
1525
1526   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1527     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1528     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1529
1530     for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1531       setOperationAction(ISD::ADD,                VT, Custom);
1532       setOperationAction(ISD::SUB,                VT, Custom);
1533       setOperationAction(ISD::MUL,                VT, Custom);
1534       setOperationAction(ISD::VSELECT,            VT, Expand);
1535
1536       setOperationAction(ISD::TRUNCATE,           VT, Custom);
1537       setOperationAction(ISD::SETCC,              VT, Custom);
1538       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1539       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1540       setOperationAction(ISD::SELECT,             VT, Custom);
1541       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1542       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1543     }
1544
1545     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1546     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1547     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1548     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1549
1550     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1551       setOperationAction(ISD::SMAX, VT, Legal);
1552       setOperationAction(ISD::UMAX, VT, Legal);
1553       setOperationAction(ISD::SMIN, VT, Legal);
1554       setOperationAction(ISD::UMIN, VT, Legal);
1555     }
1556   }
1557
1558   // We want to custom lower some of our intrinsics.
1559   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1560   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1561   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1562   if (!Subtarget.is64Bit()) {
1563     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1564     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1565   }
1566
1567   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1568   // handle type legalization for these operations here.
1569   //
1570   // FIXME: We really should do custom legalization for addition and
1571   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1572   // than generic legalization for 64-bit multiplication-with-overflow, though.
1573   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1574     if (VT == MVT::i64 && !Subtarget.is64Bit())
1575       continue;
1576     // Add/Sub/Mul with overflow operations are custom lowered.
1577     setOperationAction(ISD::SADDO, VT, Custom);
1578     setOperationAction(ISD::UADDO, VT, Custom);
1579     setOperationAction(ISD::SSUBO, VT, Custom);
1580     setOperationAction(ISD::USUBO, VT, Custom);
1581     setOperationAction(ISD::SMULO, VT, Custom);
1582     setOperationAction(ISD::UMULO, VT, Custom);
1583
1584     // Support carry in as value rather than glue.
1585     setOperationAction(ISD::ADDCARRY, VT, Custom);
1586     setOperationAction(ISD::SUBCARRY, VT, Custom);
1587     setOperationAction(ISD::SETCCCARRY, VT, Custom);
1588   }
1589
1590   if (!Subtarget.is64Bit()) {
1591     // These libcalls are not available in 32-bit.
1592     setLibcallName(RTLIB::SHL_I128, nullptr);
1593     setLibcallName(RTLIB::SRL_I128, nullptr);
1594     setLibcallName(RTLIB::SRA_I128, nullptr);
1595   }
1596
1597   // Combine sin / cos into one node or libcall if possible.
1598   if (Subtarget.hasSinCos()) {
1599     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1600     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1601     if (Subtarget.isTargetDarwin()) {
1602       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1603       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1604       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1605       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1606     }
1607   }
1608
1609   if (Subtarget.isTargetWin64()) {
1610     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1611     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1612     setOperationAction(ISD::SREM, MVT::i128, Custom);
1613     setOperationAction(ISD::UREM, MVT::i128, Custom);
1614     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1615     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1616   }
1617
1618   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1619   // is. We should promote the value to 64-bits to solve this.
1620   // This is what the CRT headers do - `fmodf` is an inline header
1621   // function casting to f64 and calling `fmod`.
1622   if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1623                               Subtarget.isTargetWindowsItanium()))
1624     for (ISD::NodeType Op :
1625          {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1626           ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1627       if (isOperationExpand(Op, MVT::f32))
1628         setOperationAction(Op, MVT::f32, Promote);
1629
1630   // We have target-specific dag combine patterns for the following nodes:
1631   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1632   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1633   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1634   setTargetDAGCombine(ISD::BITCAST);
1635   setTargetDAGCombine(ISD::VSELECT);
1636   setTargetDAGCombine(ISD::SELECT);
1637   setTargetDAGCombine(ISD::SHL);
1638   setTargetDAGCombine(ISD::SRA);
1639   setTargetDAGCombine(ISD::SRL);
1640   setTargetDAGCombine(ISD::OR);
1641   setTargetDAGCombine(ISD::AND);
1642   setTargetDAGCombine(ISD::ADD);
1643   setTargetDAGCombine(ISD::FADD);
1644   setTargetDAGCombine(ISD::FSUB);
1645   setTargetDAGCombine(ISD::FNEG);
1646   setTargetDAGCombine(ISD::FMA);
1647   setTargetDAGCombine(ISD::FMINNUM);
1648   setTargetDAGCombine(ISD::FMAXNUM);
1649   setTargetDAGCombine(ISD::SUB);
1650   setTargetDAGCombine(ISD::LOAD);
1651   setTargetDAGCombine(ISD::MLOAD);
1652   setTargetDAGCombine(ISD::STORE);
1653   setTargetDAGCombine(ISD::MSTORE);
1654   setTargetDAGCombine(ISD::TRUNCATE);
1655   setTargetDAGCombine(ISD::ZERO_EXTEND);
1656   setTargetDAGCombine(ISD::ANY_EXTEND);
1657   setTargetDAGCombine(ISD::SIGN_EXTEND);
1658   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1660   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1661   setTargetDAGCombine(ISD::SINT_TO_FP);
1662   setTargetDAGCombine(ISD::UINT_TO_FP);
1663   setTargetDAGCombine(ISD::SETCC);
1664   setTargetDAGCombine(ISD::MUL);
1665   setTargetDAGCombine(ISD::XOR);
1666   setTargetDAGCombine(ISD::MSCATTER);
1667   setTargetDAGCombine(ISD::MGATHER);
1668
1669   computeRegisterProperties(Subtarget.getRegisterInfo());
1670
1671   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1672   MaxStoresPerMemsetOptSize = 8;
1673   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1674   MaxStoresPerMemcpyOptSize = 4;
1675   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1676   MaxStoresPerMemmoveOptSize = 4;
1677
1678   // TODO: These control memcmp expansion in CGP and could be raised higher, but
1679   // that needs to benchmarked and balanced with the potential use of vector
1680   // load/store types (PR33329, PR33914).
1681   MaxLoadsPerMemcmp = 2;
1682   MaxLoadsPerMemcmpOptSize = 2;
1683
1684   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1685   setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1686
1687   // An out-of-order CPU can speculatively execute past a predictable branch,
1688   // but a conditional move could be stalled by an expensive earlier operation.
1689   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1690   EnableExtLdPromotion = true;
1691   setPrefFunctionAlignment(4); // 2^4 bytes.
1692
1693   verifyIntrinsicTables();
1694 }
1695
1696 // This has so far only been implemented for 64-bit MachO.
1697 bool X86TargetLowering::useLoadStackGuardNode() const {
1698   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1699 }
1700
1701 TargetLoweringBase::LegalizeTypeAction
1702 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1703   if (ExperimentalVectorWideningLegalization &&
1704       VT.getVectorNumElements() != 1 &&
1705       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1706     return TypeWidenVector;
1707
1708   return TargetLoweringBase::getPreferredVectorAction(VT);
1709 }
1710
1711 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1712                                           LLVMContext& Context,
1713                                           EVT VT) const {
1714   if (!VT.isVector())
1715     return MVT::i8;
1716
1717   if (VT.isSimple()) {
1718     MVT VVT = VT.getSimpleVT();
1719     const unsigned NumElts = VVT.getVectorNumElements();
1720     MVT EltVT = VVT.getVectorElementType();
1721     if (VVT.is512BitVector()) {
1722       if (Subtarget.hasAVX512())
1723         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1724             EltVT == MVT::f32 || EltVT == MVT::f64)
1725           switch(NumElts) {
1726           case  8: return MVT::v8i1;
1727           case 16: return MVT::v16i1;
1728         }
1729       if (Subtarget.hasBWI())
1730         if (EltVT == MVT::i8 || EltVT == MVT::i16)
1731           switch(NumElts) {
1732           case 32: return MVT::v32i1;
1733           case 64: return MVT::v64i1;
1734         }
1735     }
1736
1737     if (Subtarget.hasBWI() && Subtarget.hasVLX())
1738       return MVT::getVectorVT(MVT::i1, NumElts);
1739
1740     if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1741       EVT LegalVT = getTypeToTransformTo(Context, VT);
1742       EltVT = LegalVT.getVectorElementType().getSimpleVT();
1743     }
1744
1745     if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1746       switch(NumElts) {
1747       case 2: return MVT::v2i1;
1748       case 4: return MVT::v4i1;
1749       case 8: return MVT::v8i1;
1750       }
1751   }
1752
1753   return VT.changeVectorElementTypeToInteger();
1754 }
1755
1756 /// Helper for getByValTypeAlignment to determine
1757 /// the desired ByVal argument alignment.
1758 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1759   if (MaxAlign == 16)
1760     return;
1761   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1762     if (VTy->getBitWidth() == 128)
1763       MaxAlign = 16;
1764   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1765     unsigned EltAlign = 0;
1766     getMaxByValAlign(ATy->getElementType(), EltAlign);
1767     if (EltAlign > MaxAlign)
1768       MaxAlign = EltAlign;
1769   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1770     for (auto *EltTy : STy->elements()) {
1771       unsigned EltAlign = 0;
1772       getMaxByValAlign(EltTy, EltAlign);
1773       if (EltAlign > MaxAlign)
1774         MaxAlign = EltAlign;
1775       if (MaxAlign == 16)
1776         break;
1777     }
1778   }
1779 }
1780
1781 /// Return the desired alignment for ByVal aggregate
1782 /// function arguments in the caller parameter area. For X86, aggregates
1783 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1784 /// are at 4-byte boundaries.
1785 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1786                                                   const DataLayout &DL) const {
1787   if (Subtarget.is64Bit()) {
1788     // Max of 8 and alignment of type.
1789     unsigned TyAlign = DL.getABITypeAlignment(Ty);
1790     if (TyAlign > 8)
1791       return TyAlign;
1792     return 8;
1793   }
1794
1795   unsigned Align = 4;
1796   if (Subtarget.hasSSE1())
1797     getMaxByValAlign(Ty, Align);
1798   return Align;
1799 }
1800
1801 /// Returns the target specific optimal type for load
1802 /// and store operations as a result of memset, memcpy, and memmove
1803 /// lowering. If DstAlign is zero that means it's safe to destination
1804 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1805 /// means there isn't a need to check it against alignment requirement,
1806 /// probably because the source does not need to be loaded. If 'IsMemset' is
1807 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1808 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1809 /// source is constant so it does not need to be loaded.
1810 /// It returns EVT::Other if the type should be determined using generic
1811 /// target-independent logic.
1812 EVT
1813 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1814                                        unsigned DstAlign, unsigned SrcAlign,
1815                                        bool IsMemset, bool ZeroMemset,
1816                                        bool MemcpyStrSrc,
1817                                        MachineFunction &MF) const {
1818   const Function *F = MF.getFunction();
1819   if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1820     if (Size >= 16 &&
1821         (!Subtarget.isUnalignedMem16Slow() ||
1822          ((DstAlign == 0 || DstAlign >= 16) &&
1823           (SrcAlign == 0 || SrcAlign >= 16)))) {
1824       // FIXME: Check if unaligned 32-byte accesses are slow.
1825       if (Size >= 32 && Subtarget.hasAVX()) {
1826         // Although this isn't a well-supported type for AVX1, we'll let
1827         // legalization and shuffle lowering produce the optimal codegen. If we
1828         // choose an optimal type with a vector element larger than a byte,
1829         // getMemsetStores() may create an intermediate splat (using an integer
1830         // multiply) before we splat as a vector.
1831         return MVT::v32i8;
1832       }
1833       if (Subtarget.hasSSE2())
1834         return MVT::v16i8;
1835       // TODO: Can SSE1 handle a byte vector?
1836       if (Subtarget.hasSSE1())
1837         return MVT::v4f32;
1838     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1839                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1840       // Do not use f64 to lower memcpy if source is string constant. It's
1841       // better to use i32 to avoid the loads.
1842       // Also, do not use f64 to lower memset unless this is a memset of zeros.
1843       // The gymnastics of splatting a byte value into an XMM register and then
1844       // only using 8-byte stores (because this is a CPU with slow unaligned
1845       // 16-byte accesses) makes that a loser.
1846       return MVT::f64;
1847     }
1848   }
1849   // This is a compromise. If we reach here, unaligned accesses may be slow on
1850   // this target. However, creating smaller, aligned accesses could be even
1851   // slower and would certainly be a lot more code.
1852   if (Subtarget.is64Bit() && Size >= 8)
1853     return MVT::i64;
1854   return MVT::i32;
1855 }
1856
1857 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1858   if (VT == MVT::f32)
1859     return X86ScalarSSEf32;
1860   else if (VT == MVT::f64)
1861     return X86ScalarSSEf64;
1862   return true;
1863 }
1864
1865 bool
1866 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1867                                                   unsigned,
1868                                                   unsigned,
1869                                                   bool *Fast) const {
1870   if (Fast) {
1871     switch (VT.getSizeInBits()) {
1872     default:
1873       // 8-byte and under are always assumed to be fast.
1874       *Fast = true;
1875       break;
1876     case 128:
1877       *Fast = !Subtarget.isUnalignedMem16Slow();
1878       break;
1879     case 256:
1880       *Fast = !Subtarget.isUnalignedMem32Slow();
1881       break;
1882     // TODO: What about AVX-512 (512-bit) accesses?
1883     }
1884   }
1885   // Misaligned accesses of any size are always allowed.
1886   return true;
1887 }
1888
1889 /// Return the entry encoding for a jump table in the
1890 /// current function.  The returned value is a member of the
1891 /// MachineJumpTableInfo::JTEntryKind enum.
1892 unsigned X86TargetLowering::getJumpTableEncoding() const {
1893   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1894   // symbol.
1895   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1896     return MachineJumpTableInfo::EK_Custom32;
1897
1898   // Otherwise, use the normal jump table encoding heuristics.
1899   return TargetLowering::getJumpTableEncoding();
1900 }
1901
1902 bool X86TargetLowering::useSoftFloat() const {
1903   return Subtarget.useSoftFloat();
1904 }
1905
1906 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1907                                               ArgListTy &Args) const {
1908
1909   // Only relabel X86-32 for C / Stdcall CCs.
1910   if (Subtarget.is64Bit())
1911     return;
1912   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1913     return;
1914   unsigned ParamRegs = 0;
1915   if (auto *M = MF->getFunction()->getParent())
1916     ParamRegs = M->getNumberRegisterParameters();
1917
1918   // Mark the first N int arguments as having reg
1919   for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1920     Type *T = Args[Idx].Ty;
1921     if (T->isPointerTy() || T->isIntegerTy())
1922       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1923         unsigned numRegs = 1;
1924         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1925           numRegs = 2;
1926         if (ParamRegs < numRegs)
1927           return;
1928         ParamRegs -= numRegs;
1929         Args[Idx].IsInReg = true;
1930       }
1931   }
1932 }
1933
1934 const MCExpr *
1935 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1936                                              const MachineBasicBlock *MBB,
1937                                              unsigned uid,MCContext &Ctx) const{
1938   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1939   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1940   // entries.
1941   return MCSymbolRefExpr::create(MBB->getSymbol(),
1942                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1943 }
1944
1945 /// Returns relocation base for the given PIC jumptable.
1946 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1947                                                     SelectionDAG &DAG) const {
1948   if (!Subtarget.is64Bit())
1949     // This doesn't have SDLoc associated with it, but is not really the
1950     // same as a Register.
1951     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1952                        getPointerTy(DAG.getDataLayout()));
1953   return Table;
1954 }
1955
1956 /// This returns the relocation base for the given PIC jumptable,
1957 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1958 const MCExpr *X86TargetLowering::
1959 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1960                              MCContext &Ctx) const {
1961   // X86-64 uses RIP relative addressing based on the jump table label.
1962   if (Subtarget.isPICStyleRIPRel())
1963     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1964
1965   // Otherwise, the reference is relative to the PIC base.
1966   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1967 }
1968
1969 std::pair<const TargetRegisterClass *, uint8_t>
1970 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1971                                            MVT VT) const {
1972   const TargetRegisterClass *RRC = nullptr;
1973   uint8_t Cost = 1;
1974   switch (VT.SimpleTy) {
1975   default:
1976     return TargetLowering::findRepresentativeClass(TRI, VT);
1977   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1978     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1979     break;
1980   case MVT::x86mmx:
1981     RRC = &X86::VR64RegClass;
1982     break;
1983   case MVT::f32: case MVT::f64:
1984   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1985   case MVT::v4f32: case MVT::v2f64:
1986   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1987   case MVT::v8f32: case MVT::v4f64:
1988   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1989   case MVT::v16f32: case MVT::v8f64:
1990     RRC = &X86::VR128XRegClass;
1991     break;
1992   }
1993   return std::make_pair(RRC, Cost);
1994 }
1995
1996 unsigned X86TargetLowering::getAddressSpace() const {
1997   if (Subtarget.is64Bit())
1998     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1999   return 256;
2000 }
2001
2002 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2003   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2004          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2005 }
2006
2007 static Constant* SegmentOffset(IRBuilder<> &IRB,
2008                                unsigned Offset, unsigned AddressSpace) {
2009   return ConstantExpr::getIntToPtr(
2010       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2011       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2012 }
2013
2014 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2015   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2016   // tcbhead_t; use it instead of the usual global variable (see
2017   // sysdeps/{i386,x86_64}/nptl/tls.h)
2018   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2019     if (Subtarget.isTargetFuchsia()) {
2020       // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
2021       return SegmentOffset(IRB, 0x10, getAddressSpace());
2022     } else {
2023       // %fs:0x28, unless we're using a Kernel code model, in which case
2024       // it's %gs:0x28.  gs:0x14 on i386.
2025       unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2026       return SegmentOffset(IRB, Offset, getAddressSpace());
2027     }
2028   }
2029
2030   return TargetLowering::getIRStackGuard(IRB);
2031 }
2032
2033 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2034   // MSVC CRT provides functionalities for stack protection.
2035   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2036     // MSVC CRT has a global variable holding security cookie.
2037     M.getOrInsertGlobal("__security_cookie",
2038                         Type::getInt8PtrTy(M.getContext()));
2039
2040     // MSVC CRT has a function to validate security cookie.
2041     auto *SecurityCheckCookie = cast<Function>(
2042         M.getOrInsertFunction("__security_check_cookie",
2043                               Type::getVoidTy(M.getContext()),
2044                               Type::getInt8PtrTy(M.getContext())));
2045     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2046     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2047     return;
2048   }
2049   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2050   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2051     return;
2052   TargetLowering::insertSSPDeclarations(M);
2053 }
2054
2055 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2056   // MSVC CRT has a global variable holding security cookie.
2057   if (Subtarget.getTargetTriple().isOSMSVCRT())
2058     return M.getGlobalVariable("__security_cookie");
2059   return TargetLowering::getSDagStackGuard(M);
2060 }
2061
2062 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2063   // MSVC CRT has a function to validate security cookie.
2064   if (Subtarget.getTargetTriple().isOSMSVCRT())
2065     return M.getFunction("__security_check_cookie");
2066   return TargetLowering::getSSPStackGuardCheck(M);
2067 }
2068
2069 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2070   if (Subtarget.getTargetTriple().isOSContiki())
2071     return getDefaultSafeStackPointerLocation(IRB, false);
2072
2073   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2074   // definition of TLS_SLOT_SAFESTACK in
2075   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2076   if (Subtarget.isTargetAndroid()) {
2077     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2078     // %gs:0x24 on i386
2079     unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2080     return SegmentOffset(IRB, Offset, getAddressSpace());
2081   }
2082
2083   // Fuchsia is similar.
2084   if (Subtarget.isTargetFuchsia()) {
2085     // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2086     return SegmentOffset(IRB, 0x18, getAddressSpace());
2087   }
2088
2089   return TargetLowering::getSafeStackPointerLocation(IRB);
2090 }
2091
2092 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2093                                             unsigned DestAS) const {
2094   assert(SrcAS != DestAS && "Expected different address spaces!");
2095
2096   return SrcAS < 256 && DestAS < 256;
2097 }
2098
2099 //===----------------------------------------------------------------------===//
2100 //               Return Value Calling Convention Implementation
2101 //===----------------------------------------------------------------------===//
2102
2103 #include "X86GenCallingConv.inc"
2104
2105 bool X86TargetLowering::CanLowerReturn(
2106     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2107     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2108   SmallVector<CCValAssign, 16> RVLocs;
2109   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2110   return CCInfo.CheckReturn(Outs, RetCC_X86);
2111 }
2112
2113 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2114   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2115   return ScratchRegs;
2116 }
2117
2118 /// Lowers masks values (v*i1) to the local register values
2119 /// \returns DAG node after lowering to register type
2120 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2121                                const SDLoc &Dl, SelectionDAG &DAG) {
2122   EVT ValVT = ValArg.getValueType();
2123
2124   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2125       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2126     // Two stage lowering might be required
2127     // bitcast:   v8i1 -> i8 / v16i1 -> i16
2128     // anyextend: i8   -> i32 / i16   -> i32
2129     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2130     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2131     if (ValLoc == MVT::i32)
2132       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2133     return ValToCopy;
2134   } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2135              (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2136     // One stage lowering is required
2137     // bitcast:   v32i1 -> i32 / v64i1 -> i64
2138     return DAG.getBitcast(ValLoc, ValArg);
2139   } else
2140     return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2141 }
2142
2143 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2144 static void Passv64i1ArgInRegs(
2145     const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2146     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2147     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2148   assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2149          "Expected AVX512BW or AVX512BMI target!");
2150   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2151   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2152   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2153          "The value should reside in two registers");
2154
2155   // Before splitting the value we cast it to i64
2156   Arg = DAG.getBitcast(MVT::i64, Arg);
2157
2158   // Splitting the value into two i32 types
2159   SDValue Lo, Hi;
2160   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2161                    DAG.getConstant(0, Dl, MVT::i32));
2162   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2163                    DAG.getConstant(1, Dl, MVT::i32));
2164
2165   // Attach the two i32 types into corresponding registers
2166   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2167   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2168 }
2169
/// Lower an outgoing return into an X86ISD::RET_FLAG node (X86ISD::IRET for
/// the X86_INTR calling convention).
///
/// Each value in \p OutVals is converted to the location type assigned by
/// RetCC_X86 and copied into its return register. Values assigned to FP0/FP1
/// are not copied; they are attached directly as operands of the return node.
/// If the function recorded an sret return register, that pointer is also
/// copied into RAX/EAX.
///
/// \param Chain    Incoming chain.
/// \param CallConv Calling convention of the function being lowered.
/// \param isVarArg Forwarded to the CCState used for return analysis.
/// \param Outs     Descriptions of the values being returned.
/// \param OutVals  The values being returned.
/// \param dl       Debug location for the created nodes.
/// \param DAG      The DAG the nodes are added to.
/// \returns the return node, of type MVT::Other.
SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  // Assign a location to every returned value.
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Glue linking the register copies; stays empty if no copy is emitted.
  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                   MVT::i32));

  // Copy the result values into the output registers.
  // I walks RVLocs while OutsIndex walks OutVals; they diverge when a custom
  // (v64i1) value consumes two locations below.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      // Mask vectors (v*i1) need the dedicated mask-to-register lowering.
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    // (register, value) pairs to copy for this return value: one entry
    // normally, two when a v64i1 value is split.
    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      // Consumes the next location as well, hence the ++I.
      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.

    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below

    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B

    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    // RAX only for 64-bit targets that are not 64-bit ILP32; EAX otherwise.
    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  // Append every register from the zero-terminated "callee-saved via copy"
  // list as an extra return operand; only GR64 registers are expected here.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  // Interrupt handlers return with IRET instead of a plain return.
  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}
2374
2375 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2376   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2377     return false;
2378
2379   SDValue TCChain = Chain;
2380   SDNode *Copy = *N->use_begin();
2381   if (Copy->getOpcode() == ISD::CopyToReg) {
2382     // If the copy has a glue operand, we conservatively assume it isn't safe to
2383     // perform a tail call.
2384     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2385       return false;
2386     TCChain = Copy->getOperand(0);
2387   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2388     return false;
2389
2390   bool HasRet = false;
2391   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2392        UI != UE; ++UI) {
2393     if (UI->getOpcode() != X86ISD::RET_FLAG)
2394       return false;
2395     // If we are returning more than one value, we can definitely
2396     // not make a tail call see PR19530
2397     if (UI->getNumOperands() > 4)
2398       return false;
2399     if (UI->getNumOperands() == 4 &&
2400         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2401       return false;
2402     HasRet = true;
2403   }
2404
2405   if (!HasRet)
2406     return false;
2407
2408   Chain = TCChain;
2409   return true;
2410 }
2411
2412 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2413                                            ISD::NodeType ExtendKind) const {
2414   MVT ReturnMVT = MVT::i32;
2415
2416   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2417   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2418     // The ABI does not require i1, i8 or i16 to be extended.
2419     //
2420     // On Darwin, there is code in the wild relying on Clang's old behaviour of
2421     // always extending i8/i16 return values, so keep doing that for now.
2422     // (PR26665).
2423     ReturnMVT = MVT::i8;
2424   }
2425
2426   EVT MinVT = getRegisterType(Context, ReturnMVT);
2427   return VT.bitsLT(MinVT) ? MinVT : VT;
2428 }
2429
2430 /// Reads two 32 bit registers and creates a 64 bit mask value.
2431 /// \param VA The current 32 bit value that need to be assigned.
2432 /// \param NextVA The next 32 bit value that need to be assigned.
2433 /// \param Root The parent DAG node.
2434 /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2435 ///                        glue purposes. In the case the DAG is already using
2436 ///                        physical register instead of virtual, we should glue
2437 ///                        our new SDValue to InFlag SDvalue.
2438 /// \return a new SDvalue of size 64bit.
2439 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2440                                 SDValue &Root, SelectionDAG &DAG,
2441                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
2442                                 SDValue *InFlag = nullptr) {
2443   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2444   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2445   assert(VA.getValVT() == MVT::v64i1 &&
2446          "Expecting first location of 64 bit width type");
2447   assert(NextVA.getValVT() == VA.getValVT() &&
2448          "The locations should have the same type");
2449   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2450          "The values should reside in two registers");
2451
2452   SDValue Lo, Hi;
2453   unsigned Reg;
2454   SDValue ArgValueLo, ArgValueHi;
2455
2456   MachineFunction &MF = DAG.getMachineFunction();
2457   const TargetRegisterClass *RC = &X86::GR32RegClass;
2458
2459   // Read a 32 bit value from the registers
2460   if (nullptr == InFlag) {
2461     // When no physical register is present,
2462     // create an intermediate virtual register
2463     Reg = MF.addLiveIn(VA.getLocReg(), RC);
2464     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2465     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2466     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2467   } else {
2468     // When a physical register is available read the value from it and glue
2469     // the reads together.
2470     ArgValueLo =
2471       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2472     *InFlag = ArgValueLo.getValue(2);
2473     ArgValueHi =
2474       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2475     *InFlag = ArgValueHi.getValue(2);
2476   }
2477
2478   // Convert the i32 type into v32i1 type
2479   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2480
2481   // Convert the i32 type into v32i1 type
2482   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2483
2484   // Concatenate the two values together
2485   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2486 }
2487
2488 /// The function will lower a register of various sizes (8/16/32/64)
2489 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2490 /// \returns a DAG node contains the operand after lowering to mask type.
2491 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2492                                const EVT &ValLoc, const SDLoc &Dl,
2493                                SelectionDAG &DAG) {
2494   SDValue ValReturned = ValArg;
2495
2496   if (ValVT == MVT::v1i1)
2497     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2498
2499   if (ValVT == MVT::v64i1) {
2500     // In 32 bit machine, this case is handled by getv64i1Argument
2501     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2502     // In 64 bit machine, There is no need to truncate the value only bitcast
2503   } else {
2504     MVT maskLen;
2505     switch (ValVT.getSimpleVT().SimpleTy) {
2506     case MVT::v8i1:
2507       maskLen = MVT::i8;
2508       break;
2509     case MVT::v16i1:
2510       maskLen = MVT::i16;
2511       break;
2512     case MVT::v32i1:
2513       maskLen = MVT::i32;
2514       break;
2515     default:
2516       llvm_unreachable("Expecting a vector of i1 types");
2517     }
2518
2519     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2520   }
2521   return DAG.getBitcast(ValVT, ValReturned);
2522 }
2523
/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
/// \param Chain    Incoming chain; the updated chain is returned.
/// \param InFlag   Glue threaded through the register copies.
/// \param CallConv Calling convention of the call.
/// \param isVarArg Forwarded to the CCState used for result analysis.
/// \param Ins      Descriptions of the values returned by the call.
/// \param dl       Debug location for the created nodes.
/// \param DAG      The DAG the nodes are added to.
/// \param [out] InVals Receives one lowered value per returned value.
/// \param [in,out] RegMask Optional register mask; when non-null, the bits of
///                 every result register and its sub-registers are cleared.
/// \returns the chain after the copies.
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  // I walks RVLocs while InsIndex walks Ins; they diverge when a custom
  // (v64i1) value consumes two locations below.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      // The mask is an array of 32-bit words with one bit per register.
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      // Consumes the next location as well, hence the ++I.
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      // Of the CopyFromReg results, 0 is used as the value, 1 as the chain
      // and 2 as the glue.
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    // i1-based values that were extended in their location are converted
    // back to their original type here.
    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}
2606
2607 //===----------------------------------------------------------------------===//
2608 //                C & StdCall & Fast Calling Convention implementation
2609 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack, and symbols are decorated
//  differently. It does not support any vector arguments.
//  For information on the fast calling convention, see the Fast Calling
//  Convention (tail call) implementation, LowerX86_32FastCCCallTo.
2616
/// Classifies how a call or a function returns a struct through an 'sret'
/// argument. Values of this type are produced by callIsStructReturn() and
/// argsAreStructReturn(), which inspect only the first argument.
enum StructReturnType {
  NotStructReturn,  // No arguments, or the first argument is not sret.
  RegStructReturn,  // sret argument that is 'inreg', or the target is MCU.
  StackStructReturn // Any other sret argument.
};
2624 static StructReturnType
2625 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2626   if (Outs.empty())
2627     return NotStructReturn;
2628
2629   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2630   if (!Flags.isSRet())
2631     return NotStructReturn;
2632   if (Flags.isInReg() || IsMCU)
2633     return RegStructReturn;
2634   return StackStructReturn;
2635 }
2636
2637 /// Determines whether a function uses struct return semantics.
2638 static StructReturnType
2639 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2640   if (Ins.empty())
2641     return NotStructReturn;
2642
2643   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2644   if (!Flags.isSRet())
2645     return NotStructReturn;
2646   if (Flags.isInReg() || IsMCU)
2647     return RegStructReturn;
2648   return StackStructReturn;
2649 }
2650
2651 /// Make a copy of an aggregate at address specified by "Src" to address
2652 /// "Dst" with size and alignment information specified by the specific
2653 /// parameter attribute. The copy will be passed as a byval function parameter.
2654 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2655                                          SDValue Chain, ISD::ArgFlagsTy Flags,
2656                                          SelectionDAG &DAG, const SDLoc &dl) {
2657   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2658
2659   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2660                        /*isVolatile*/false, /*AlwaysInline=*/true,
2661                        /*isTailCall*/false,
2662                        MachinePointerInfo(), MachinePointerInfo());
2663 }
2664
2665 /// Return true if the calling convention is one that we can guarantee TCO for.
2666 static bool canGuaranteeTCO(CallingConv::ID CC) {
2667   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2668           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2669           CC == CallingConv::HHVM);
2670 }
2671
2672 /// Return true if we might ever do TCO for calls with this calling convention.
2673 static bool mayTailCallThisCC(CallingConv::ID CC) {
2674   switch (CC) {
2675   // C calling conventions:
2676   case CallingConv::C:
2677   case CallingConv::Win64:
2678   case CallingConv::X86_64_SysV:
2679   // Callee pop conventions:
2680   case CallingConv::X86_ThisCall:
2681   case CallingConv::X86_StdCall:
2682   case CallingConv::X86_VectorCall:
2683   case CallingConv::X86_FastCall:
2684     return true;
2685   default:
2686     return canGuaranteeTCO(CC);
2687   }
2688 }
2689
2690 /// Return true if the function is being made into a tailcall target by
2691 /// changing its ABI.
2692 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2693   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2694 }
2695
2696 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2697   auto Attr =
2698       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2699   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2700     return false;
2701
2702   ImmutableCallSite CS(CI);
2703   CallingConv::ID CalleeCC = CS.getCallingConv();
2704   if (!mayTailCallThisCC(CalleeCC))
2705     return false;
2706
2707   return true;
2708 }
2709
2710 SDValue
2711 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2712                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2713                                     const SDLoc &dl, SelectionDAG &DAG,
2714                                     const CCValAssign &VA,
2715                                     MachineFrameInfo &MFI, unsigned i) const {
2716   // Create the nodes corresponding to a load from this parameter slot.
2717   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2718   bool AlwaysUseMutable = shouldGuaranteeTCO(
2719       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2720   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2721   EVT ValVT;
2722   MVT PtrVT = getPointerTy(DAG.getDataLayout());
2723
2724   // If value is passed by pointer we have address passed instead of the value
2725   // itself. No need to extend if the mask value and location share the same
2726   // absolute size.
2727   bool ExtendedInMem =
2728       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2729       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2730
2731   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2732     ValVT = VA.getLocVT();
2733   else
2734     ValVT = VA.getValVT();
2735
2736   // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2737   // taken by a return address.
2738   int Offset = 0;
2739   if (CallConv == CallingConv::X86_INTR) {
2740     // X86 interrupts may take one or two arguments.
2741     // On the stack there will be no return address as in regular call.
2742     // Offset of last argument need to be set to -4/-8 bytes.
2743     // Where offset of the first argument out of two, should be set to 0 bytes.
2744     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2745     if (Subtarget.is64Bit() && Ins.size() == 2) {
2746       // The stack pointer needs to be realigned for 64 bit handlers with error
2747       // code, so the argument offset changes by 8 bytes.
2748       Offset += 8;
2749     }
2750   }
2751
2752   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2753   // changed with more analysis.
2754   // In case of tail call optimization mark all arguments mutable. Since they
2755   // could be overwritten by lowering of arguments in case of a tail call.
2756   if (Flags.isByVal()) {
2757     unsigned Bytes = Flags.getByValSize();
2758     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2759     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2760     // Adjust SP offset of interrupt parameter.
2761     if (CallConv == CallingConv::X86_INTR) {
2762       MFI.setObjectOffset(FI, Offset);
2763     }
2764     return DAG.getFrameIndex(FI, PtrVT);
2765   }
2766
2767   // This is an argument in memory. We might be able to perform copy elision.
2768   if (Flags.isCopyElisionCandidate()) {
2769     EVT ArgVT = Ins[i].ArgVT;
2770     SDValue PartAddr;
2771     if (Ins[i].PartOffset == 0) {
2772       // If this is a one-part value or the first part of a multi-part value,
2773       // create a stack object for the entire argument value type and return a
2774       // load from our portion of it. This assumes that if the first part of an
2775       // argument is in memory, the rest will also be in memory.
2776       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2777                                      /*Immutable=*/false);
2778       PartAddr = DAG.getFrameIndex(FI, PtrVT);
2779       return DAG.getLoad(
2780           ValVT, dl, Chain, PartAddr,
2781           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2782     } else {
2783       // This is not the first piece of an argument in memory. See if there is
2784       // already a fixed stack object including this offset. If so, assume it
2785       // was created by the PartOffset == 0 branch above and create a load from
2786       // the appropriate offset into it.
2787       int64_t PartBegin = VA.getLocMemOffset();
2788       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2789       int FI = MFI.getObjectIndexBegin();
2790       for (; MFI.isFixedObjectIndex(FI); ++FI) {
2791         int64_t ObjBegin = MFI.getObjectOffset(FI);
2792         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2793         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2794           break;
2795       }
2796       if (MFI.isFixedObjectIndex(FI)) {
2797         SDValue Addr =
2798             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2799                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2800         return DAG.getLoad(
2801             ValVT, dl, Chain, Addr,
2802             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2803                                               Ins[i].PartOffset));
2804       }
2805     }
2806   }
2807
2808   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2809                                  VA.getLocMemOffset(), isImmutable);
2810
2811   // Set SExt or ZExt flag.
2812   if (VA.getLocInfo() == CCValAssign::ZExt) {
2813     MFI.setObjectZExt(FI, true);
2814   } else if (VA.getLocInfo() == CCValAssign::SExt) {
2815     MFI.setObjectSExt(FI, true);
2816   }
2817
2818   // Adjust SP offset of interrupt parameter.
2819   if (CallConv == CallingConv::X86_INTR) {
2820     MFI.setObjectOffset(FI, Offset);
2821   }
2822
2823   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2824   SDValue Val = DAG.getLoad(
2825       ValVT, dl, Chain, FIN,
2826       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2827   return ExtendedInMem
2828              ? (VA.getValVT().isVector()
2829                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2830                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2831              : Val;
2832 }
2833
2834 // FIXME: Get this from tablegen.
2835 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2836                                                 const X86Subtarget &Subtarget) {
2837   assert(Subtarget.is64Bit());
2838
2839   if (Subtarget.isCallingConvWin64(CallConv)) {
2840     static const MCPhysReg GPR64ArgRegsWin64[] = {
2841       X86::RCX, X86::RDX, X86::R8,  X86::R9
2842     };
2843     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2844   }
2845
2846   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2847     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2848   };
2849   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2850 }
2851
2852 // FIXME: Get this from tablegen.
2853 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2854                                                 CallingConv::ID CallConv,
2855                                                 const X86Subtarget &Subtarget) {
2856   assert(Subtarget.is64Bit());
2857   if (Subtarget.isCallingConvWin64(CallConv)) {
2858     // The XMM registers which might contain var arg parameters are shadowed
2859     // in their paired GPR.  So we only need to save the GPR to their home
2860     // slots.
2861     // TODO: __vectorcall will change this.
2862     return None;
2863   }
2864
2865   const Function *Fn = MF.getFunction();
2866   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2867   bool isSoftFloat = Subtarget.useSoftFloat();
2868   assert(!(isSoftFloat && NoImplicitFloatOps) &&
2869          "SSE register cannot be used when SSE is disabled!");
2870   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2871     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2872     // registers.
2873     return None;
2874
2875   static const MCPhysReg XMMArgRegs64Bit[] = {
2876     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2877     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2878   };
2879   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2880 }
2881
2882 #ifndef NDEBUG
2883 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2884   return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2885                         [](const CCValAssign &A, const CCValAssign &B) -> bool {
2886                           return A.getValNo() < B.getValNo();
2887                         });
2888 }
2889 #endif
2890
/// Lowers the incoming (formal) arguments of the current function into DAG
/// values.
///
/// Assigns a location to every entry of \p Ins using CC_X86 and appends one
/// SDValue per argument to \p InVals: register arguments are copied out of
/// their live-in registers, memory arguments are read via LowerMemArgument.
/// Besides that it saves the sret pointer in a virtual register, sets up the
/// vararg frame index and register save area, collects the registers that
/// must be forwarded for musttail calls in vararg functions, and records the
/// argument stack size and the number of bytes to pop on return in
/// X86MachineFunctionInfo.
///
/// \returns the updated chain.
SDValue X86TargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

  // Force a frame pointer for an externally visible "main" on Cygwin/MinGW
  // targets.
  const Function *Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget.isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget.is64Bit();
  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

  // An interrupt handler takes either a single argument, or two arguments
  // where the second one is a pointer-sized integer (i64 on 64-bit targets,
  // i32 otherwise).
  if (CallConv == CallingConv::X86_INTR) {
    bool isLegal = Ins.size() == 1 ||
                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
    if (!isLegal)
      report_fatal_error("X86 interrupts may take one or two arguments");
  }

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeArguments(Ins, CC_X86);

  // In vectorcall calling convention a second pass is required for the HVA
  // types.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }

  // The next loop assumes that the locations are in the same order of the
  // input arguments.
  assert(isSortedByValueNo(ArgLocs) &&
         "Argument Location list must be sorted before lowering");

  // I indexes ArgLocs and InsIndex indexes Ins. They can diverge because a
  // custom (split v64i1) argument consumes two ArgLocs entries (see the ++I
  // below) but only one Ins entry.
  SDValue ArgValue;
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
    assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");

        // v64i1 values, in regcall calling convention, that are
        // compiled to 32 bit arch, are split up into two registers.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
        // Pick the register class that matches the location type.
        const TargetRegisterClass *RC;
        if (RegVT == MVT::i32)
          RC = &X86::GR32RegClass;
        else if (Is64Bit && RegVT == MVT::i64)
          RC = &X86::GR64RegClass;
        else if (RegVT == MVT::f32)
          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
        else if (RegVT == MVT::f64)
          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
        else if (RegVT == MVT::f80)
          RC = &X86::RFP80RegClass;
        else if (RegVT == MVT::f128)
          RC = &X86::FR128RegClass;
        else if (RegVT.is512BitVector())
          RC = &X86::VR512RegClass;
        else if (RegVT.is256BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
        else if (RegVT.is128BitVector())
          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
        else if (RegVT == MVT::x86mmx)
          RC = &X86::VR64RegClass;
        else if (RegVT == MVT::v1i1)
          RC = &X86::VK1RegClass;
        else if (RegVT == MVT::v8i1)
          RC = &X86::VK8RegClass;
        else if (RegVT == MVT::v16i1)
          RC = &X86::VK16RegClass;
        else if (RegVT == MVT::v32i1)
          RC = &X86::VK32RegClass;
        else if (RegVT == MVT::v64i1)
          RC = &X86::VK64RegClass;
        else
          llvm_unreachable("Unknown argument type!");

        // Mark the physical register live-in and read it through the
        // register returned by addLiveIn.
        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      }

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else if (VA.getValVT().isVector() &&
                 VA.getValVT().getScalarType() == MVT::i1 &&
                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }

  // Save the first sret argument (if any) so the return lowering can find it.
  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // Swift calling convention does not require we copy the sret argument
    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
    if (CallConv == CallingConv::Swift)
      continue;

    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
    if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (!Reg) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      break;
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (shouldGuaranteeTCO(CallConv,
                         MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start. We
  // can skip this if there are no va_start calls.
  if (MFI.hasVAStart() &&
      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                   CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }

  // Figure out if XMM registers are in use.
  assert(!(Subtarget.useSoftFloat() &&
           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
         "SSE register cannot be used when SSE is disabled!");

  // 64-bit calling conventions support varargs and register parameters, so we
  // have to do extra work to spill them in the prologue.
  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
    // Find the first unallocated argument registers.
    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
           "SSE register cannot be used when SSE is disabled!");

    // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
    if (!ArgXMMs.empty()) {
      // AL is read as well and passed on to VASTART_SAVE_XMM_REGS below.
      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
      // Get to the caller-allocated home save location.  Add 8 to account
      // for the return address.
      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
      FuncInfo->setRegSaveFrameIndex(
          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
      // The save area has room for all argument GPRs (8 bytes each) followed
      // by all argument XMM registers (16 bytes each), 16-byte aligned.
      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }

    // Store the integer parameter registers.
    SmallVector<SDValue, 8> MemOps;
    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                      getPointerTy(DAG.getDataLayout()));
    unsigned Offset = FuncInfo->getVarArgsGPOffset();
    for (SDValue Val : LiveGPRs) {
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
      // Each store is chained on the chain result of its own CopyFromReg.
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }

    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
      // Now store the XMM (fp + vector) parameter registers.
      // Operands: chain, AL, register save frame index, FP offset, followed
      // by the live XMM register values.
      SmallVector<SDValue, 12> SaveXMMOps;
      SaveXMMOps.push_back(Chain);
      SaveXMMOps.push_back(ALVal);
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getRegSaveFrameIndex(), dl));
      SaveXMMOps.push_back(DAG.getIntPtrConstant(
                             FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }

    if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }

  // Vararg functions containing musttail calls must forward the argument
  // registers that were not allocated to fixed arguments.
  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
    // Find the largest legal vector type.
    MVT VecVT = MVT::Other;
    // FIXME: Only some x86_32 calling conventions support AVX512.
    if (Subtarget.hasAVX512() &&
        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
                     CallConv == CallingConv::Intel_OCL_BI)))
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;

    // We forward some GPRs and some vector types.
    SmallVector<MVT, 2> RegParmTypes;
    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
    RegParmTypes.push_back(IntVT);
    if (VecVT != MVT::Other)
      RegParmTypes.push_back(VecVT);

    // Compute the set of forwarded registers. The rest are scratch.
    SmallVectorImpl<ForwardedRegister> &Forwards =
        FuncInfo->getForwardedMustTailRegParms();
    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

    // Conservatively forward AL on x86_64, since it might be used for varargs.
    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }

    // Copy all forwards from physical to virtual registers.
    for (ForwardedRegister &F : Forwards) {
      // FIXME: Can we use a less constrained schedule?
      // F.VReg is replaced by a fresh virtual register holding the copy.
      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }
  }

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
        !Subtarget.getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    // NOTE(review): 0xAAAAAAA looks like a deliberately bogus placeholder
    // index - confirm.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
      // TODO: Add a mechanism to frame lowering that will allow us to indicate
      // that we'd prefer this slot be allocated towards the bottom of the frame
      // (i.e. near the stack pointer after allocating the frame).  Every
      // funclet needs a copy of this slot in its (mostly empty) frame, and the
      // offset from the bottom of this and each funclet's frame must be the
      // same, so the size of funclets' (mostly empty) frames is dictated by
      // how far this slot is from the bottom (since they allocate just enough
      // space to accommodate holding this slot at the correct offset).
      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }

  // For regcall functions and functions marked "no_caller_saved_registers",
  // remove every live-in register from the callee-saved register set.
  if (CallConv == CallingConv::X86_RegCall ||
      Fn->hasFnAttribute("no_caller_saved_registers")) {
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
      MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
3259
3260 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3261                                             SDValue Arg, const SDLoc &dl,
3262                                             SelectionDAG &DAG,
3263                                             const CCValAssign &VA,
3264                                             ISD::ArgFlagsTy Flags) const {
3265   unsigned LocMemOffset = VA.getLocMemOffset();
3266   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3267   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3268                        StackPtr, PtrOff);
3269   if (Flags.isByVal())
3270     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3271
3272   return DAG.getStore(
3273       Chain, dl, Arg, PtrOff,
3274       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3275 }
3276
3277 /// Emit a load of return address if tail call
3278 /// optimization is performed and it is required.
3279 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3280     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3281     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3282   // Adjust the Return address stack slot.
3283   EVT VT = getPointerTy(DAG.getDataLayout());
3284   OutRetAddr = getReturnAddressFrameIndex(DAG);
3285
3286   // Load the "old" Return address.
3287   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3288   return SDValue(OutRetAddr.getNode(), 1);
3289 }
3290
3291 /// Emit a store of the return address if tail call
3292 /// optimization is performed and it is required (FPDiff!=0).
3293 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3294                                         SDValue Chain, SDValue RetAddrFrIdx,
3295                                         EVT PtrVT, unsigned SlotSize,
3296                                         int FPDiff, const SDLoc &dl) {
3297   // Store the return address to the appropriate stack slot.
3298   if (!FPDiff) return Chain;
3299   // Calculate the new stack slot for the return address.
3300   int NewReturnAddrFI =
3301     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3302                                          false);
3303   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3304   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3305                        MachinePointerInfo::getFixedStack(
3306                            DAG.getMachineFunction(), NewReturnAddrFI));
3307   return Chain;
3308 }
3309
3310 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3311 /// operation of specified width.
3312 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3313                        SDValue V2) {
3314   unsigned NumElems = VT.getVectorNumElements();
3315   SmallVector<int, 8> Mask;
3316   Mask.push_back(NumElems);
3317   for (unsigned i = 1; i != NumElems; ++i)
3318     Mask.push_back(i);
3319   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3320 }
3321
/// Lower an outgoing call described by \p CLI into SelectionDAG nodes.
///
/// The visible steps are: decide whether the call is emitted as a tail call
/// (musttail, sibcall or guaranteed TCO), assign a location to every outgoing
/// argument with CCState, open the call sequence, copy register arguments and
/// store memory arguments, build the callee operand and the register mask,
/// and emit either X86ISD::TC_RETURN (tail call) or X86ISD::CALL followed by
/// CALLSEQ_END.
///
/// \param CLI    Call description. CLI.IsTailCall is bound by reference and
///               may be cleared or set here.
/// \param InVals Forwarded to LowerCallResult on the non-tail-call path.
/// \returns the TC_RETURN node for tail calls, otherwise whatever
///          LowerCallResult returns.
3322 SDValue
3323 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3324                              SmallVectorImpl<SDValue> &InVals) const {
3325   SelectionDAG &DAG                     = CLI.DAG;
3326   SDLoc &dl                             = CLI.DL;
3327   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3328   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3329   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3330   SDValue Chain                         = CLI.Chain;
3331   SDValue Callee                        = CLI.Callee;
3332   CallingConv::ID CallConv              = CLI.CallConv;
3333   bool &isTailCall                      = CLI.IsTailCall;
3334   bool isVarArg                         = CLI.IsVarArg;
3335
3336   MachineFunction &MF = DAG.getMachineFunction();
3337   bool Is64Bit        = Subtarget.is64Bit();
3338   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3339   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3340   bool IsSibcall      = false;
3341   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3342   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
       // CI is null when there is no call site or the call site is not a
       // CallInst; Fn is null when CI has no directly called function.
3343   const CallInst *CI =
3344       CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3345   const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
       // "no_caller_saved_registers" counts if it is present on either the call
       // instruction or the called function.
3346   bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3347                  (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3348
3349   if (CallConv == CallingConv::X86_INTR)
3350     report_fatal_error("X86 interrupts may not be called directly");
3351
       // The caller's "disable-tail-calls"="true" attribute turns off the tail
       // call request (a musttail call re-enables it below).
3352   if (Attr.getValueAsString() == "true")
3353     isTailCall = false;
3354
3355   if (Subtarget.isPICStyleGOT() &&
3356       !MF.getTarget().Options.GuaranteedTailCallOpt) {
3357     // If we are using a GOT, disable tail calls to external symbols with
3358     // default visibility. Tail calling such a symbol requires using a GOT
3359     // relocation, which forces early binding of the symbol. This breaks code
3360     // that require lazy function symbol resolution. Using musttail or
3361     // GuaranteedTailCallOpt will override this.
3362     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3363     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3364                G->getGlobal()->hasDefaultVisibility()))
3365       isTailCall = false;
3366   }
3367
3368   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3369   if (IsMustTail) {
3370     // Force this to be a tail call.  The verifier rules are enough to ensure
3371     // that we can lower this successfully without moving the return address
3372     // around.
3373     isTailCall = true;
3374   } else if (isTailCall) {
3375     // Check if it's really possible to do a tail call.
3376     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3377                     isVarArg, SR != NotStructReturn,
3378                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3379                     Outs, OutVals, Ins, DAG);
3380
3381     // Sibcalls are automatically detected tailcalls which do not require
3382     // ABI changes.
3383     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3384       IsSibcall = true;
3385
3386     if (isTailCall)
3387       ++NumTailCalls;
3388   }
3389
3390   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3391          "Var args not supported with calling convention fastcc, ghc or hipe");
3392
3393   // Analyze operands of the call, assigning locations to each operand.
3394   SmallVector<CCValAssign, 16> ArgLocs;
3395   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3396
3397   // Allocate shadow area for Win64.
3398   if (IsWin64)
3399     CCInfo.AllocateStack(32, 8);
3400
3401   CCInfo.AnalyzeArguments(Outs, CC_X86);
3402
3403   // In vectorcall calling convention a second pass is required for the HVA
3404   // types.
3405   if (CallingConv::X86_VectorCall == CallConv) {
3406     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3407   }
3408
3409   // Get a count of how many bytes are to be pushed on the stack.
3410   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3411   if (IsSibcall)
3412     // This is a sibcall. The memory operands are available in caller's
3413     // own caller's stack.
3414     NumBytes = 0;
3415   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3416            canGuaranteeTCO(CallConv))
3417     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3418
       // FPDiff stays 0 for non-tail calls, sibcalls and musttail calls.
3419   int FPDiff = 0;
3420   if (isTailCall && !IsSibcall && !IsMustTail) {
3421     // Lower arguments at fp - stackoffset + fpdiff.
3422     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3423
3424     FPDiff = NumBytesCallerPushed - NumBytes;
3425
3426     // Set the delta of movement of the returnaddr stackslot.
3427     // But only update it if this delta is smaller than the recorded one.
3428     if (FPDiff < X86Info->getTCReturnAddrDelta())
3429       X86Info->setTCReturnAddrDelta(FPDiff);
3430   }
3431
3432   unsigned NumBytesToPush = NumBytes;
3433   unsigned NumBytesToPop = NumBytes;
3434
3435   // If we have an inalloca argument, all stack space has already been allocated
3436   // for us and is right at the top of the stack.  We don't support multiple
3437   // arguments passed in memory when using inalloca.
3438   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3439     NumBytesToPush = 0;
3440     if (!ArgLocs.back().isMemLoc())
3441       report_fatal_error("cannot use inalloca attribute on a register "
3442                          "parameter");
3443     if (ArgLocs.back().getLocMemOffset() != 0)
3444       report_fatal_error("any parameter with the inalloca attribute must be "
3445                          "the only memory argument");
3446   }
3447
       // Sibcalls do not open a call sequence. The second operand is the part of
       // NumBytes that is not pushed here (non-zero only in the inalloca case
       // above).
3448   if (!IsSibcall)
3449     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3450                                  NumBytes - NumBytesToPush, dl);
3451
3452   SDValue RetAddrFrIdx;
3453   // Load return address for tail calls.
3454   if (isTailCall && FPDiff)
3455     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3456                                     Is64Bit, FPDiff, dl);
3457
3458   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3459   SmallVector<SDValue, 8> MemOpChains;
3460   SDValue StackPtr;
3461
3462   // The next loop assumes that the locations are in the same order of the
3463   // input arguments.
3464   assert(isSortedByValueNo(ArgLocs) &&
3465          "Argument Location list must be sorted before lowering");
3466
3467   // Walk the register/memloc assignments, inserting copies/loads.  In the case
3468   // of tail call optimization arguments are handled later.
3469   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3470   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3471        ++I, ++OutIndex) {
3472     assert(OutIndex < Outs.size() && "Invalid Out index");
3473     // Skip inalloca arguments, they have already been written.
3474     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3475     if (Flags.isInAlloca())
3476       continue;
3477
3478     CCValAssign &VA = ArgLocs[I];
3479     EVT RegVT = VA.getLocVT();
3480     SDValue Arg = OutVals[OutIndex];
3481     bool isByVal = Flags.isByVal();
3482
3483     // Promote the value if needed.
3484     switch (VA.getLocInfo()) {
3485     default: llvm_unreachable("Unknown loc info!");
3486     case CCValAssign::Full: break;
3487     case CCValAssign::SExt:
3488       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3489       break;
3490     case CCValAssign::ZExt:
3491       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3492       break;
3493     case CCValAssign::AExt:
3494       if (Arg.getValueType().isVector() &&
3495           Arg.getValueType().getVectorElementType() == MVT::i1)
3496         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3497       else if (RegVT.is128BitVector()) {
3498         // Special case: passing MMX values in XMM registers.
3499         Arg = DAG.getBitcast(MVT::i64, Arg);
3500         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3501         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3502       } else
3503         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3504       break;
3505     case CCValAssign::BCvt:
3506       Arg = DAG.getBitcast(RegVT, Arg);
3507       break;
3508     case CCValAssign::Indirect: {
3509       // Store the argument.
3510       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3511       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3512       Chain = DAG.getStore(
3513           Chain, dl, Arg, SpillSlot,
3514           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
           // From here on the argument is the address of the spill slot.
3515       Arg = SpillSlot;
3516       break;
3517     }
3518     }
3519
3520     if (VA.needsCustom()) {
3521       assert(VA.getValVT() == MVT::v64i1 &&
3522              "Currently the only custom case is when we split v64i1 to 2 regs");
3523       // Split v64i1 value into two registers
           // ArgLocs[++I] also consumes the next location, so the loop skips it.
3524       Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3525                          Subtarget);
3526     } else if (VA.isRegLoc()) {
3527       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3528       if (isVarArg && IsWin64) {
3529         // Win64 ABI requires argument XMM reg to be copied to the corresponding
3530         // shadow reg if callee is a varargs function.
3531         unsigned ShadowReg = 0;
3532         switch (VA.getLocReg()) {
3533         case X86::XMM0: ShadowReg = X86::RCX; break;
3534         case X86::XMM1: ShadowReg = X86::RDX; break;
3535         case X86::XMM2: ShadowReg = X86::R8; break;
3536         case X86::XMM3: ShadowReg = X86::R9; break;
3537         }
3538         if (ShadowReg)
3539           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3540       }
         // Memory arguments are written here for non-tail calls, and for byval
         // arguments of non-sibcall tail calls. Memory arguments of tail calls
         // are (also) handled by the tail-call loop further down.
3541     } else if (!IsSibcall && (!isTailCall || isByVal)) {
3542       assert(VA.isMemLoc());
           // Materialize the stack pointer copy lazily, on first use.
3543       if (!StackPtr.getNode())
3544         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3545                                       getPointerTy(DAG.getDataLayout()));
3546       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3547                                              dl, DAG, VA, Flags));
3548     }
3549   }
3550
       // Join all argument stores into a single chain.
3551   if (!MemOpChains.empty())
3552     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3553
3554   if (Subtarget.isPICStyleGOT()) {
3555     // ELF / PIC requires GOT in the EBX register before function calls via PLT
3556     // GOT pointer.
3557     if (!isTailCall) {
3558       RegsToPass.push_back(std::make_pair(
3559           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3560                                           getPointerTy(DAG.getDataLayout()))));
3561     } else {
3562       // If we are tail calling and generating PIC/GOT style code load the
3563       // address of the callee into ECX. The value in ecx is used as target of
3564       // the tail jump. This is done to circumvent the ebx/callee-saved problem
3565       // for tail calls on PIC/GOT architectures. Normally we would just put the
3566       // address of GOT into ebx and then call target@PLT. But for tail calls
3567       // ebx would be restored (since ebx is callee saved) before jumping to the
3568       // target@PLT.
3569
3570       // Note: The actual moving to ECX is done further down.
3571       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3572       if (G && !G->getGlobal()->hasLocalLinkage() &&
3573           G->getGlobal()->hasDefaultVisibility())
3574         Callee = LowerGlobalAddress(Callee, DAG);
3575       else if (isa<ExternalSymbolSDNode>(Callee))
3576         Callee = LowerExternalSymbol(Callee, DAG);
3577     }
3578   }
3579
3580   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3581     // From AMD64 ABI document:
3582     // For calls that may call functions that use varargs or stdargs
3583     // (prototype-less calls or calls to functions containing ellipsis (...) in
3584     // the declaration) %al is used as hidden argument to specify the number
3585     // of SSE registers used. The contents of %al do not need to match exactly
3586     // the number of registers, but must be an ubound on the number of SSE
3587     // registers used and is in the range 0 - 8 inclusive.
3588
3589     // Count the number of XMM registers allocated.
3590     static const MCPhysReg XMMArgRegs[] = {
3591       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3592       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3593     };
3594     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3595     assert((Subtarget.hasSSE1() || !NumXMMRegs)
3596            && "SSE registers cannot be used when SSE is disabled");
3597
3598     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3599                                         DAG.getConstant(NumXMMRegs, dl,
3600                                                         MVT::i8)));
3601   }
3602
       // Vararg musttail call: pass on every register parameter recorded in
       // X86Info as forwarded, copying each from its virtual register into its
       // physical register.
3603   if (isVarArg && IsMustTail) {
3604     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3605     for (const auto &F : Forwards) {
3606       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3607       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3608     }
3609   }
3610
3611   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3612   // don't need this because the eligibility check rejects calls that require
3613   // shuffling arguments passed in memory.
3614   if (!IsSibcall && isTailCall) {
3615     // Force all the incoming stack arguments to be loaded from the stack
3616     // before any new outgoing arguments are stored to the stack, because the
3617     // outgoing stack slots may alias the incoming argument stack slots, and
3618     // the alias isn't otherwise explicit. This is slightly more conservative
3619     // than necessary, because it means that each store effectively depends
3620     // on every argument instead of just those arguments it would clobber.
3621     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3622
3623     SmallVector<SDValue, 8> MemOpChains2;
3624     SDValue FIN;
3625     int FI = 0;
3626     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3627          ++I, ++OutsIndex) {
3628       CCValAssign &VA = ArgLocs[I];
3629
3630       if (VA.isRegLoc()) {
3631         if (VA.needsCustom()) {
3632           assert((CallConv == CallingConv::X86_RegCall) &&
3633                  "Expecting custom case only in regcall calling convention");
3634           // This means that we are in special case where one argument was
3635           // passed through two register locations - Skip the next location
3636           ++I;
3637         }
3638
3639         continue;
3640       }
3641
3642       assert(VA.isMemLoc());
3643       SDValue Arg = OutVals[OutsIndex];
3644       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3645       // Skip inalloca arguments.  They don't require any work.
3646       if (Flags.isInAlloca())
3647         continue;
3648       // Create frame index.
3649       int32_t Offset = VA.getLocMemOffset()+FPDiff;
           // Size of the location type in bytes, rounded up.
3650       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3651       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3652       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3653
3654       if (Flags.isByVal()) {
3655         // Copy relative to framepointer.
             // The copy source is StackPtr + the argument's location offset.
3656         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3657         if (!StackPtr.getNode())
3658           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3659                                         getPointerTy(DAG.getDataLayout()));
3660         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3661                              StackPtr, Source);
3662
3663         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3664                                                          ArgChain,
3665                                                          Flags, DAG, dl));
3666       } else {
3667         // Store relative to framepointer.
3668         MemOpChains2.push_back(DAG.getStore(
3669             ArgChain, dl, Arg, FIN,
3670             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3671       }
3672     }
3673
3674     if (!MemOpChains2.empty())
3675       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3676
3677     // Store the return address to the appropriate stack slot.
3678     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3679                                      getPointerTy(DAG.getDataLayout()),
3680                                      RegInfo->getSlotSize(), FPDiff, dl);
3681   }
3682
3683   // Build a sequence of copy-to-reg nodes chained together with token chain
3684   // and flag operands which copy the outgoing args into registers.
3685   SDValue InFlag;
3686   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3687     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3688                              RegsToPass[i].second, InFlag);
3689     InFlag = Chain.getValue(1);
3690   }
3691
       // Rewrite the callee operand. Note that the large-code-model branch is
       // intentionally empty: Callee is left as it is in that case.
3692   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3693     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3694     // In the 64-bit large code model, we have to make all calls
3695     // through a register, since the call instruction's 32-bit
3696     // pc-relative offset may not be large enough to hold the whole
3697     // address.
3698   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3699     // If the callee is a GlobalAddress node (quite common, every direct call
3700     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3701     // it.
3702     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3703
3704     // We should use extra load for direct calls to dllimported functions in
3705     // non-JIT mode.
3706     const GlobalValue *GV = G->getGlobal();
3707     if (!GV->hasDLLImportStorageClass()) {
3708       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3709
3710       Callee = DAG.getTargetGlobalAddress(
3711           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3712
3713       if (OpFlags == X86II::MO_GOTPCREL) {
3714         // Add a wrapper.
3715         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3716           getPointerTy(DAG.getDataLayout()), Callee);
3717         // Add extra indirection
3718         Callee = DAG.getLoad(
3719             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3720             MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3721       }
3722     }
3723   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3724     const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3725     unsigned char OpFlags =
3726         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3727
3728     Callee = DAG.getTargetExternalSymbol(
3729         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3730   } else if (Subtarget.isTarget64BitILP32() &&
3731              Callee->getValueType(0) == MVT::i32) {
3732     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3733     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3734   }
3735
3736   // Returns a chain & a flag for retval copy to use.
3737   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3738   SmallVector<SDValue, 8> Ops;
3739
       // For a non-sibcall tail call the call sequence is closed before the
       // TC_RETURN node is built (callee-pop amount 0).
3740   if (!IsSibcall && isTailCall) {
3741     Chain = DAG.getCALLSEQ_END(Chain,
3742                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3743                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3744     InFlag = Chain.getValue(1);
3745   }
3746
       // Operand order: chain, callee, [FPDiff for tail calls], argument
       // registers, register mask, [glue].
3747   Ops.push_back(Chain);
3748   Ops.push_back(Callee);
3749
3750   if (isTailCall)
3751     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3752
3753   // Add argument registers to the end of the list so that they are known live
3754   // into the call.
3755   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3756     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3757                                   RegsToPass[i].second.getValueType()));
3758
3759   // Add a register mask operand representing the call-preserved registers.
3760   // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3761   // set X86_INTR calling convention because it has the same CSR mask
3762   // (same preserved registers).
3763   const uint32_t *Mask = RegInfo->getCallPreservedMask(
3764       MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3765   assert(Mask && "Missing call preserved mask for calling convention");
3766
3767   // If this is an invoke in a 32-bit function using a funclet-based
3768   // personality, assume the function clobbers all registers. If an exception
3769   // is thrown, the runtime will not restore CSRs.
3770   // FIXME: Model this more precisely so that we can register allocate across
3771   // the normal edge and spill and fill across the exceptional edge.
3772   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3773     const Function *CallerFn = MF.getFunction();
3774     EHPersonality Pers =
3775         CallerFn->hasPersonalityFn()
3776             ? classifyEHPersonality(CallerFn->getPersonalityFn())
3777             : EHPersonality::Unknown;
3778     if (isFuncletEHPersonality(Pers))
3779       Mask = RegInfo->getNoPreservedMask();
3780   }
3781
3782   // Define a new register mask from the existing mask.
       // Stays null unless a per-call mask is allocated below; it is also handed
       // to LowerCallResult at the end of this function.
3783   uint32_t *RegMask = nullptr;
3784
3785   // In some calling conventions we need to remove the used physical registers
3786   // from the reg mask.
3787   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3788     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3789
3790     // Allocate a new Reg Mask and copy Mask.
3791     RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
         // One bit per register, packed into 32-bit words (rounded up).
3792     unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3793     memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3794
3795     // Make sure all sub registers of the argument registers are reset
3796     // in the RegMask.
3797     for (auto const &RegPair : RegsToPass)
3798       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3799            SubRegs.isValid(); ++SubRegs)
3800         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3801
3802     // Create the RegMask Operand according to our updated mask.
3803     Ops.push_back(DAG.getRegisterMask(RegMask));
3804   } else {
3805     // Create the RegMask Operand according to the static mask.
3806     Ops.push_back(DAG.getRegisterMask(Mask));
3807   }
3808
3809   if (InFlag.getNode())
3810     Ops.push_back(InFlag);
3811
3812   if (isTailCall) {
3813     // We used to do:
3814     //// If this is the first return lowered for this function, add the regs
3815     //// to the liveout set for the function.
3816     // This isn't right, although it's probably harmless on x86; liveouts
3817     // should be computed from returns not tail calls.  Consider a void
3818     // function making a tail call to a function returning int.
3819     MF.getFrameInfo().setHasTailCall();
3820     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3821   }
3822
3823   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3824   InFlag = Chain.getValue(1);
3825
3826   // Create the CALLSEQ_END node.
3827   unsigned NumBytesForCalleeToPop;
3828   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3829                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3830     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3831   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3832            !Subtarget.getTargetTriple().isOSMSVCRT() &&
3833            SR == StackStructReturn)
3834     // If this is a call to a struct-return function, the callee
3835     // pops the hidden struct pointer, so we have to push it back.
3836     // This is common for Darwin/X86, Linux & Mingw32 targets.
3837     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3838     NumBytesForCalleeToPop = 4;
3839   else
3840     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3841
3842   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3843     // No need to reset the stack after the call if the call doesn't return. To
3844     // make the MI verify, we'll pretend the callee does it for us.
3845     NumBytesForCalleeToPop = NumBytes;
3846   }
3847
3848   // Returns a flag for retval copy to use.
3849   if (!IsSibcall) {
3850     Chain = DAG.getCALLSEQ_END(Chain,
3851                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3852                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3853                                                      true),
3854                                InFlag, dl);
3855     InFlag = Chain.getValue(1);
3856   }
3857
3858   // Handle result values, copying them out of physregs into vregs that we
3859   // return.
3860   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3861                          InVals, RegMask);
3862 }
3863
3864 //===----------------------------------------------------------------------===//
3865 //                Fast Calling Convention (tail call) implementation
3866 //===----------------------------------------------------------------------===//
3867
3868 //  Like stdcall, the callee cleans up the arguments, except that ECX is
3869 //  reserved for storing the tail called function address. Only 2 registers are
3870 //  free for argument passing (inreg). Tail call optimization is performed
3871 //  provided:
3872 //                * tailcallopt is enabled
3873 //                * caller/callee are fastcc
3874 //  On X86_64 architecture with GOT-style position independent code only local
3875 //  (within module) calls are supported at the moment.
3876 //  To keep the stack aligned according to platform abi the function
3877 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3878 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3879 //  If a tail called function callee has more arguments than the caller the
3880 //  caller needs to make sure that there is room to move the RETADDR to. This is
3881 //  achieved by reserving an area the size of the argument delta right after the
3882 //  original RETADDR, but before the saved framepointer or the spilled registers
3883 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3884 //  stack layout:
3885 //    arg1
3886 //    arg2
3887 //    RETADDR
3888 //    [ new RETADDR
3889 //      move area ]
3890 //    (possible EBP)
3891 //    ESI
3892 //    EDI
3893 //    local1 ..
3894
3895 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
3896 /// requirement.
3897 unsigned
3898 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3899                                                SelectionDAG& DAG) const {
3900   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3901   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3902   unsigned StackAlignment = TFI.getStackAlignment();
3903   uint64_t AlignMask = StackAlignment - 1;
3904   int64_t Offset = StackSize;
3905   unsigned SlotSize = RegInfo->getSlotSize();
3906   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3907     // Number smaller than 12 so just add the difference.
3908     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3909   } else {
3910     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3911     Offset = ((~AlignMask) & Offset) + StackAlignment +
3912       (StackAlignment-SlotSize);
3913   }
3914   return Offset;
3915 }
3916
3917 /// Return true if the given stack call argument is already available in the
3918 /// same position (relatively) of the caller's incoming argument stack.
3919 static
3920 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3921                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3922                          const X86InstrInfo *TII, const CCValAssign &VA) {
3923   unsigned Bytes = Arg.getValueSizeInBits() / 8;
3924
3925   for (;;) {
3926     // Look through nodes that don't alter the bits of the incoming value.
3927     unsigned Op = Arg.getOpcode();
3928     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3929       Arg = Arg.getOperand(0);
3930       continue;
3931     }
3932     if (Op == ISD::TRUNCATE) {
3933       const SDValue &TruncInput = Arg.getOperand(0);
3934       if (TruncInput.getOpcode() == ISD::AssertZext &&
3935           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3936               Arg.getValueType()) {
3937         Arg = TruncInput.getOperand(0);
3938         continue;
3939       }
3940     }
3941     break;
3942   }
3943
3944   int FI = INT_MAX;
3945   if (Arg.getOpcode() == ISD::CopyFromReg) {
3946     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3947     if (!TargetRegisterInfo::isVirtualRegister(VR))
3948       return false;
3949     MachineInstr *Def = MRI->getVRegDef(VR);
3950     if (!Def)
3951       return false;
3952     if (!Flags.isByVal()) {
3953       if (!TII->isLoadFromStackSlot(*Def, FI))
3954         return false;
3955     } else {
3956       unsigned Opcode = Def->getOpcode();
3957       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3958            Opcode == X86::LEA64_32r) &&
3959           Def->getOperand(1).isFI()) {
3960         FI = Def->getOperand(1).getIndex();
3961         Bytes = Flags.getByValSize();
3962       } else
3963         return false;
3964     }
3965   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3966     if (Flags.isByVal())
3967       // ByVal argument is passed in as a pointer but it's now being
3968       // dereferenced. e.g.
3969       // define @foo(%struct.X* %A) {
3970       //   tail call @bar(%struct.X* byval %A)
3971       // }
3972       return false;
3973     SDValue Ptr = Ld->getBasePtr();
3974     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3975     if (!FINode)
3976       return false;
3977     FI = FINode->getIndex();
3978   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3979     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3980     FI = FINode->getIndex();
3981     Bytes = Flags.getByValSize();
3982   } else
3983     return false;
3984
3985   assert(FI != INT_MAX);
3986   if (!MFI.isFixedObjectIndex(FI))
3987     return false;
3988
3989   if (Offset != MFI.getObjectOffset(FI))
3990     return false;
3991
3992   // If this is not byval, check that the argument stack object is immutable.
3993   // inalloca and argument copy elision can create mutable argument stack
3994   // objects. Byval objects can be mutated, but a byval call intends to pass the
3995   // mutated memory.
3996   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
3997     return false;
3998
3999   if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4000     // If the argument location is wider than the argument type, check that any
4001     // extension flags match.
4002     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4003         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4004       return false;
4005     }
4006   }
4007
4008   return Bytes == MFI.getObjectSize(FI);
4009 }
4010
4011 /// Check whether the call is eligible for tail call optimization. Targets
4012 /// that want to do tail call optimization should implement this function.
4013 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4014     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4015     bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4016     const SmallVectorImpl<ISD::OutputArg> &Outs,
4017     const SmallVectorImpl<SDValue> &OutVals,
4018     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4019   if (!mayTailCallThisCC(CalleeCC))
4020     return false;
4021
4022   // If -tailcallopt is specified, make fastcc functions tail-callable.
4023   MachineFunction &MF = DAG.getMachineFunction();
4024   const Function *CallerF = MF.getFunction();
4025
4026   // If the function return type is x86_fp80 and the callee return type is not,
4027   // then the FP_EXTEND of the call result is not a nop. It's not safe to
4028   // perform a tailcall optimization here.
4029   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4030     return false;
4031
4032   CallingConv::ID CallerCC = CallerF->getCallingConv();
4033   bool CCMatch = CallerCC == CalleeCC;
4034   bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4035   bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4036
4037   // Win64 functions have extra shadow space for argument homing. Don't do the
4038   // sibcall if the caller and callee have mismatched expectations for this
4039   // space.
4040   if (IsCalleeWin64 != IsCallerWin64)
4041     return false;
4042
4043   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4044     if (canGuaranteeTCO(CalleeCC) && CCMatch)
4045       return true;
4046     return false;
4047   }
4048
4049   // Look for obvious safe cases to perform tail call optimization that do not
4050   // require ABI changes. This is what gcc calls sibcall.
4051
4052   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4053   // emit a special epilogue.
4054   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4055   if (RegInfo->needsStackRealignment(MF))
4056     return false;
4057
4058   // Also avoid sibcall optimization if either caller or callee uses struct
4059   // return semantics.
4060   if (isCalleeStructRet || isCallerStructRet)
4061     return false;
4062
4063   // Do not sibcall optimize vararg calls unless all arguments are passed via
4064   // registers.
4065   LLVMContext &C = *DAG.getContext();
4066   if (isVarArg && !Outs.empty()) {
4067     // Optimizing for varargs on Win64 is unlikely to be safe without
4068     // additional testing.
4069     if (IsCalleeWin64 || IsCallerWin64)
4070       return false;
4071
4072     SmallVector<CCValAssign, 16> ArgLocs;
4073     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4074
4075     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4076     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4077       if (!ArgLocs[i].isRegLoc())
4078         return false;
4079   }
4080
4081   // If the call result is in ST0 / ST1, it needs to be popped off the x87
4082   // stack.  Therefore, if it's not used by the call it is not safe to optimize
4083   // this into a sibcall.
4084   bool Unused = false;
4085   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4086     if (!Ins[i].Used) {
4087       Unused = true;
4088       break;
4089     }
4090   }
4091   if (Unused) {
4092     SmallVector<CCValAssign, 16> RVLocs;
4093     CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4094     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4095     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4096       CCValAssign &VA = RVLocs[i];
4097       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4098         return false;
4099     }
4100   }
4101
4102   // Check that the call results are passed in the same way.
4103   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4104                                   RetCC_X86, RetCC_X86))
4105     return false;
4106   // The callee has to preserve all registers the caller needs to preserve.
4107   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4108   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4109   if (!CCMatch) {
4110     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4111     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4112       return false;
4113   }
4114
4115   unsigned StackArgsSize = 0;
4116
4117   // If the callee takes no arguments then go on to check the results of the
4118   // call.
4119   if (!Outs.empty()) {
4120     // Check if stack adjustment is needed. For now, do not do this if any
4121     // argument is passed on the stack.
4122     SmallVector<CCValAssign, 16> ArgLocs;
4123     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4124
4125     // Allocate shadow area for Win64
4126     if (IsCalleeWin64)
4127       CCInfo.AllocateStack(32, 8);
4128
4129     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4130     StackArgsSize = CCInfo.getNextStackOffset();
4131
4132     if (CCInfo.getNextStackOffset()) {
4133       // Check if the arguments are already laid out in the right way as
4134       // the caller's fixed stack objects.
4135       MachineFrameInfo &MFI = MF.getFrameInfo();
4136       const MachineRegisterInfo *MRI = &MF.getRegInfo();
4137       const X86InstrInfo *TII = Subtarget.getInstrInfo();
4138       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4139         CCValAssign &VA = ArgLocs[i];
4140         SDValue Arg = OutVals[i];
4141         ISD::ArgFlagsTy Flags = Outs[i].Flags;
4142         if (VA.getLocInfo() == CCValAssign::Indirect)
4143           return false;
4144         if (!VA.isRegLoc()) {
4145           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4146                                    MFI, MRI, TII, VA))
4147             return false;
4148         }
4149       }
4150     }
4151
4152     bool PositionIndependent = isPositionIndependent();
4153     // If the tailcall address may be in a register, then make sure it's
4154     // possible to register allocate for it. In 32-bit, the call address can
4155     // only target EAX, EDX, or ECX since the tail call must be scheduled after
4156     // callee-saved registers are restored. These happen to be the same
4157     // registers used to pass 'inreg' arguments so watch out for those.
4158     if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4159                                   !isa<ExternalSymbolSDNode>(Callee)) ||
4160                                  PositionIndependent)) {
4161       unsigned NumInRegs = 0;
4162       // In PIC we need an extra register to formulate the address computation
4163       // for the callee.
4164       unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4165
4166       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4167         CCValAssign &VA = ArgLocs[i];
4168         if (!VA.isRegLoc())
4169           continue;
4170         unsigned Reg = VA.getLocReg();
4171         switch (Reg) {
4172         default: break;
4173         case X86::EAX: case X86::EDX: case X86::ECX:
4174           if (++NumInRegs == MaxInRegs)
4175             return false;
4176           break;
4177         }
4178       }
4179     }
4180
4181     const MachineRegisterInfo &MRI = MF.getRegInfo();
4182     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4183       return false;
4184   }
4185
4186   bool CalleeWillPop =
4187       X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4188                        MF.getTarget().Options.GuaranteedTailCallOpt);
4189
4190   if (unsigned BytesToPop =
4191           MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4192     // If we have bytes to pop, the callee must pop them.
4193     bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4194     if (!CalleePopMatches)
4195       return false;
4196   } else if (CalleeWillPop && StackArgsSize > 0) {
4197     // If we don't have bytes to pop, make sure the callee doesn't pop any.
4198     return false;
4199   }
4200
4201   return true;
4202 }
4203
4204 FastISel *
4205 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4206                                   const TargetLibraryInfo *libInfo) const {
4207   return X86::createFastISel(funcInfo, libInfo);
4208 }
4209
4210 //===----------------------------------------------------------------------===//
4211 //                           Other Lowering Hooks
4212 //===----------------------------------------------------------------------===//
4213
4214 static bool MayFoldLoad(SDValue Op) {
4215   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4216 }
4217
4218 static bool MayFoldIntoStore(SDValue Op) {
4219   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4220 }
4221
4222 static bool MayFoldIntoZeroExtend(SDValue Op) {
4223   if (Op.hasOneUse()) {
4224     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4225     return (ISD::ZERO_EXTEND == Opcode);
4226   }
4227   return false;
4228 }
4229
4230 static bool isTargetShuffle(unsigned Opcode) {
4231   switch(Opcode) {
4232   default: return false;
4233   case X86ISD::BLENDI:
4234   case X86ISD::PSHUFB:
4235   case X86ISD::PSHUFD:
4236   case X86ISD::PSHUFHW:
4237   case X86ISD::PSHUFLW:
4238   case X86ISD::SHUFP:
4239   case X86ISD::INSERTPS:
4240   case X86ISD::EXTRQI:
4241   case X86ISD::INSERTQI:
4242   case X86ISD::PALIGNR:
4243   case X86ISD::VSHLDQ:
4244   case X86ISD::VSRLDQ:
4245   case X86ISD::MOVLHPS:
4246   case X86ISD::MOVLHPD:
4247   case X86ISD::MOVHLPS:
4248   case X86ISD::MOVLPS:
4249   case X86ISD::MOVLPD:
4250   case X86ISD::MOVSHDUP:
4251   case X86ISD::MOVSLDUP:
4252   case X86ISD::MOVDDUP:
4253   case X86ISD::MOVSS:
4254   case X86ISD::MOVSD:
4255   case X86ISD::UNPCKL:
4256   case X86ISD::UNPCKH:
4257   case X86ISD::VBROADCAST:
4258   case X86ISD::VPERMILPI:
4259   case X86ISD::VPERMILPV:
4260   case X86ISD::VPERM2X128:
4261   case X86ISD::VPERMIL2:
4262   case X86ISD::VPERMI:
4263   case X86ISD::VPPERM:
4264   case X86ISD::VPERMV:
4265   case X86ISD::VPERMV3:
4266   case X86ISD::VPERMIV3:
4267   case X86ISD::VZEXT_MOVL:
4268     return true;
4269   }
4270 }
4271
4272 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4273   switch (Opcode) {
4274   default: return false;
4275   // Target Shuffles.
4276   case X86ISD::PSHUFB:
4277   case X86ISD::VPERMILPV:
4278   case X86ISD::VPERMIL2:
4279   case X86ISD::VPPERM:
4280   case X86ISD::VPERMV:
4281   case X86ISD::VPERMV3:
4282   case X86ISD::VPERMIV3:
4283     return true;
4284   // 'Faux' Target Shuffles.
4285   case ISD::AND:
4286   case X86ISD::ANDNP:
4287     return true;
4288   }
4289 }
4290
4291 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4292   MachineFunction &MF = DAG.getMachineFunction();
4293   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4294   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4295   int ReturnAddrIndex = FuncInfo->getRAIndex();
4296
4297   if (ReturnAddrIndex == 0) {
4298     // Set up a frame object for the return address.
4299     unsigned SlotSize = RegInfo->getSlotSize();
4300     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4301                                                           -(int64_t)SlotSize,
4302                                                           false);
4303     FuncInfo->setRAIndex(ReturnAddrIndex);
4304   }
4305
4306   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4307 }
4308
4309 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4310                                        bool hasSymbolicDisplacement) {
4311   // Offset should fit into 32 bit immediate field.
4312   if (!isInt<32>(Offset))
4313     return false;
4314
4315   // If we don't have a symbolic displacement - we don't have any extra
4316   // restrictions.
4317   if (!hasSymbolicDisplacement)
4318     return true;
4319
4320   // FIXME: Some tweaks might be needed for medium code model.
4321   if (M != CodeModel::Small && M != CodeModel::Kernel)
4322     return false;
4323
4324   // For small code model we assume that latest object is 16MB before end of 31
4325   // bits boundary. We may also accept pretty large negative constants knowing
4326   // that all objects are in the positive half of address space.
4327   if (M == CodeModel::Small && Offset < 16*1024*1024)
4328     return true;
4329
4330   // For kernel code model we know that all object resist in the negative half
4331   // of 32bits address space. We may not accept negative offsets, since they may
4332   // be just off and we may accept pretty large positive ones.
4333   if (M == CodeModel::Kernel && Offset >= 0)
4334     return true;
4335
4336   return false;
4337 }
4338
4339 /// Determines whether the callee is required to pop its own arguments.
4340 /// Callee pop is necessary to support tail calls.
4341 bool X86::isCalleePop(CallingConv::ID CallingConv,
4342                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4343   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4344   // can guarantee TCO.
4345   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4346     return true;
4347
4348   switch (CallingConv) {
4349   default:
4350     return false;
4351   case CallingConv::X86_StdCall:
4352   case CallingConv::X86_FastCall:
4353   case CallingConv::X86_ThisCall:
4354   case CallingConv::X86_VectorCall:
4355     return !is64Bit;
4356   }
4357 }
4358
4359 /// \brief Return true if the condition is an unsigned comparison operation.
4360 static bool isX86CCUnsigned(unsigned X86CC) {
4361   switch (X86CC) {
4362   default:
4363     llvm_unreachable("Invalid integer condition!");
4364   case X86::COND_E:
4365   case X86::COND_NE:
4366   case X86::COND_B:
4367   case X86::COND_A:
4368   case X86::COND_BE:
4369   case X86::COND_AE:
4370     return true;
4371   case X86::COND_G:
4372   case X86::COND_GE:
4373   case X86::COND_L:
4374   case X86::COND_LE:
4375     return false;
4376   }
4377 }
4378
4379 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4380   switch (SetCCOpcode) {
4381   default: llvm_unreachable("Invalid integer condition!");
4382   case ISD::SETEQ:  return X86::COND_E;
4383   case ISD::SETGT:  return X86::COND_G;
4384   case ISD::SETGE:  return X86::COND_GE;
4385   case ISD::SETLT:  return X86::COND_L;
4386   case ISD::SETLE:  return X86::COND_LE;
4387   case ISD::SETNE:  return X86::COND_NE;
4388   case ISD::SETULT: return X86::COND_B;
4389   case ISD::SETUGT: return X86::COND_A;
4390   case ISD::SETULE: return X86::COND_BE;
4391   case ISD::SETUGE: return X86::COND_AE;
4392   }
4393 }
4394
4395 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4396 /// condition code, returning the condition code and the LHS/RHS of the
4397 /// comparison to make.
4398 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4399                                bool isFP, SDValue &LHS, SDValue &RHS,
4400                                SelectionDAG &DAG) {
4401   if (!isFP) {
4402     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4403       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4404         // X > -1   -> X == 0, jump !sign.
4405         RHS = DAG.getConstant(0, DL, RHS.getValueType());
4406         return X86::COND_NS;
4407       }
4408       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4409         // X < 0   -> X == 0, jump on sign.
4410         return X86::COND_S;
4411       }
4412       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4413         // X < 1   -> X <= 0
4414         RHS = DAG.getConstant(0, DL, RHS.getValueType());
4415         return X86::COND_LE;
4416       }
4417     }
4418
4419     return TranslateIntegerX86CC(SetCCOpcode);
4420   }
4421
4422   // First determine if it is required or is profitable to flip the operands.
4423
4424   // If LHS is a foldable load, but RHS is not, flip the condition.
4425   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4426       !ISD::isNON_EXTLoad(RHS.getNode())) {
4427     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4428     std::swap(LHS, RHS);
4429   }
4430
4431   switch (SetCCOpcode) {
4432   default: break;
4433   case ISD::SETOLT:
4434   case ISD::SETOLE:
4435   case ISD::SETUGT:
4436   case ISD::SETUGE:
4437     std::swap(LHS, RHS);
4438     break;
4439   }
4440
4441   // On a floating point condition, the flags are set as follows:
4442   // ZF  PF  CF   op
4443   //  0 | 0 | 0 | X > Y
4444   //  0 | 0 | 1 | X < Y
4445   //  1 | 0 | 0 | X == Y
4446   //  1 | 1 | 1 | unordered
4447   switch (SetCCOpcode) {
4448   default: llvm_unreachable("Condcode should be pre-legalized away");
4449   case ISD::SETUEQ:
4450   case ISD::SETEQ:   return X86::COND_E;
4451   case ISD::SETOLT:              // flipped
4452   case ISD::SETOGT:
4453   case ISD::SETGT:   return X86::COND_A;
4454   case ISD::SETOLE:              // flipped
4455   case ISD::SETOGE:
4456   case ISD::SETGE:   return X86::COND_AE;
4457   case ISD::SETUGT:              // flipped
4458   case ISD::SETULT:
4459   case ISD::SETLT:   return X86::COND_B;
4460   case ISD::SETUGE:              // flipped
4461   case ISD::SETULE:
4462   case ISD::SETLE:   return X86::COND_BE;
4463   case ISD::SETONE:
4464   case ISD::SETNE:   return X86::COND_NE;
4465   case ISD::SETUO:   return X86::COND_P;
4466   case ISD::SETO:    return X86::COND_NP;
4467   case ISD::SETOEQ:
4468   case ISD::SETUNE:  return X86::COND_INVALID;
4469   }
4470 }
4471
4472 /// Is there a floating point cmov for the specific X86 condition code?
4473 /// Current x86 isa includes the following FP cmov instructions:
4474 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4475 static bool hasFPCMov(unsigned X86CC) {
4476   switch (X86CC) {
4477   default:
4478     return false;
4479   case X86::COND_B:
4480   case X86::COND_BE:
4481   case X86::COND_E:
4482   case X86::COND_P:
4483   case X86::COND_A:
4484   case X86::COND_AE:
4485   case X86::COND_NE:
4486   case X86::COND_NP:
4487     return true;
4488   }
4489 }
4490
4491
4492 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4493                                            const CallInst &I,
4494                                            unsigned Intrinsic) const {
4495
4496   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4497   if (!IntrData)
4498     return false;
4499
4500   Info.opc = ISD::INTRINSIC_W_CHAIN;
4501   Info.readMem = false;
4502   Info.writeMem = false;
4503   Info.vol = false;
4504   Info.offset = 0;
4505
4506   switch (IntrData->Type) {
4507   case EXPAND_FROM_MEM: {
4508     Info.ptrVal = I.getArgOperand(0);
4509     Info.memVT = MVT::getVT(I.getType());
4510     Info.align = 1;
4511     Info.readMem = true;
4512     break;
4513   }
4514   case COMPRESS_TO_MEM: {
4515     Info.ptrVal = I.getArgOperand(0);
4516     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4517     Info.align = 1;
4518     Info.writeMem = true;
4519     break;
4520   }
4521   case TRUNCATE_TO_MEM_VI8:
4522   case TRUNCATE_TO_MEM_VI16:
4523   case TRUNCATE_TO_MEM_VI32: {
4524     Info.ptrVal = I.getArgOperand(0);
4525     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
4526     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4527     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4528       ScalarVT = MVT::i8;
4529     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4530       ScalarVT = MVT::i16;
4531     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4532       ScalarVT = MVT::i32;
4533
4534     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4535     Info.align = 1;
4536     Info.writeMem = true;
4537     break;
4538   }
4539   default:
4540     return false;
4541   }
4542
4543   return true;
4544 }
4545
4546 /// Returns true if the target can instruction select the
4547 /// specified FP immediate natively. If false, the legalizer will
4548 /// materialize the FP immediate as a load from a constant pool.
4549 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4550   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4551     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4552       return true;
4553   }
4554   return false;
4555 }
4556
4557 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4558                                               ISD::LoadExtType ExtTy,
4559                                               EVT NewVT) const {
4560   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4561   // relocation target a movq or addq instruction: don't let the load shrink.
4562   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4563   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4564     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4565       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4566   return true;
4567 }
4568
4569 /// \brief Returns true if it is beneficial to convert a load of a constant
4570 /// to just the constant itself.
4571 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4572                                                           Type *Ty) const {
4573   assert(Ty->isIntegerTy());
4574
4575   unsigned BitSize = Ty->getPrimitiveSizeInBits();
4576   if (BitSize == 0 || BitSize > 64)
4577     return false;
4578   return true;
4579 }
4580
4581 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4582                                                 unsigned Index) const {
4583   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4584     return false;
4585
4586   return (Index == 0 || Index == ResVT.getVectorNumElements());
4587 }
4588
4589 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4590   // Speculate cttz only if we can directly use TZCNT.
4591   return Subtarget.hasBMI();
4592 }
4593
4594 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4595   // Speculate ctlz only if we can directly use LZCNT.
4596   return Subtarget.hasLZCNT();
4597 }
4598
4599 bool X86TargetLowering::isCtlzFast() const {
4600   return Subtarget.hasFastLZCNT();
4601 }
4602
4603 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4604     const Instruction &AndI) const {
4605   return true;
4606 }
4607
4608 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4609   if (!Subtarget.hasBMI())
4610     return false;
4611
4612   // There are only 32-bit and 64-bit forms for 'andn'.
4613   EVT VT = Y.getValueType();
4614   if (VT != MVT::i32 && VT != MVT::i64)
4615     return false;
4616
4617   return true;
4618 }
4619
4620 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4621   MVT VT = MVT::getIntegerVT(NumBits);
4622   if (isTypeLegal(VT))
4623     return VT;
4624
4625   // PMOVMSKB can handle this.
4626   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4627     return MVT::v16i8;
4628
4629   // VPMOVMSKB can handle this.
4630   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4631     return MVT::v32i8;
4632
4633   // TODO: Allow 64-bit type for 32-bit target.
4634   // TODO: 512-bit types should be allowed, but make sure that those
4635   // cases are handled in combineVectorSizedSetCCEquality().
4636
4637   return MVT::INVALID_SIMPLE_VALUE_TYPE;
4638 }
4639
4640 /// Val is the undef sentinel value or equal to the specified value.
4641 static bool isUndefOrEqual(int Val, int CmpVal) {
4642   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4643 }
4644
4645 /// Val is either the undef or zero sentinel value.
4646 static bool isUndefOrZero(int Val) {
4647   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4648 }
4649
4650 /// Return true if every element in Mask, beginning
4651 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4652 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4653   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4654     if (Mask[i] != SM_SentinelUndef)
4655       return false;
4656   return true;
4657 }
4658
4659 /// Return true if Val is undef or if its value falls within the
4660 /// specified range (L, H].
4661 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4662   return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4663 }
4664
4665 /// Return true if every element in Mask is undef or if its value
4666 /// falls within the specified range (L, H].
4667 static bool isUndefOrInRange(ArrayRef<int> Mask,
4668                              int Low, int Hi) {
4669   for (int M : Mask)
4670     if (!isUndefOrInRange(M, Low, Hi))
4671       return false;
4672   return true;
4673 }
4674
4675 /// Return true if Val is undef, zero or if its value falls within the
4676 /// specified range (L, H].
4677 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4678   return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4679 }
4680
4681 /// Return true if every element in Mask is undef, zero or if its value
4682 /// falls within the specified range (L, H].
4683 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4684   for (int M : Mask)
4685     if (!isUndefOrZeroOrInRange(M, Low, Hi))
4686       return false;
4687   return true;
4688 }
4689
4690 /// Return true if every element in Mask, beginning
4691 /// from position Pos and ending in Pos+Size, falls within the specified
4692 /// sequential range (Low, Low+Size]. or is undef.
4693 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4694                                        unsigned Pos, unsigned Size, int Low) {
4695   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4696     if (!isUndefOrEqual(Mask[i], Low))
4697       return false;
4698   return true;
4699 }
4700
4701 /// Return true if every element in Mask, beginning
4702 /// from position Pos and ending in Pos+Size, falls within the specified
4703 /// sequential range (Low, Low+Size], or is undef or is zero.
4704 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4705                                              unsigned Size, int Low) {
4706   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4707     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4708       return false;
4709   return true;
4710 }
4711
4712 /// Return true if every element in Mask, beginning
4713 /// from position Pos and ending in Pos+Size is undef or is zero.
4714 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4715                                  unsigned Size) {
4716   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4717     if (!isUndefOrZero(Mask[i]))
4718       return false;
4719   return true;
4720 }
4721
4722 /// \brief Helper function to test whether a shuffle mask could be
4723 /// simplified by widening the elements being shuffled.
4724 ///
4725 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4726 /// leaves it in an unspecified state.
4727 ///
4728 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4729 /// shuffle masks. The latter have the special property of a '-2' representing
4730 /// a zero-ed lane of a vector.
4731 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4732                                     SmallVectorImpl<int> &WidenedMask) {
4733   WidenedMask.assign(Mask.size() / 2, 0);
4734   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4735     int M0 = Mask[i];
4736     int M1 = Mask[i + 1];
4737
4738     // If both elements are undef, its trivial.
4739     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4740       WidenedMask[i / 2] = SM_SentinelUndef;
4741       continue;
4742     }
4743
4744     // Check for an undef mask and a mask value properly aligned to fit with
4745     // a pair of values. If we find such a case, use the non-undef mask's value.
4746     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4747       WidenedMask[i / 2] = M1 / 2;
4748       continue;
4749     }
4750     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4751       WidenedMask[i / 2] = M0 / 2;
4752       continue;
4753     }
4754
4755     // When zeroing, we need to spread the zeroing across both lanes to widen.
4756     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4757       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4758           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4759         WidenedMask[i / 2] = SM_SentinelZero;
4760         continue;
4761       }
4762       return false;
4763     }
4764
4765     // Finally check if the two mask values are adjacent and aligned with
4766     // a pair.
4767     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4768       WidenedMask[i / 2] = M0 / 2;
4769       continue;
4770     }
4771
4772     // Otherwise we can't safely widen the elements used in this shuffle.
4773     return false;
4774   }
4775   assert(WidenedMask.size() == Mask.size() / 2 &&
4776          "Incorrect size of mask after widening the elements!");
4777
4778   return true;
4779 }
4780
4781 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4782 /// mask index with the scaled sequential indices for an equivalent narrowed
4783 /// mask. This is the reverse process to canWidenShuffleElements, but can always
4784 /// succeed.
4785 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4786                              SmallVectorImpl<int> &ScaledMask) {
4787   assert(0 < Scale && "Unexpected scaling factor");
4788   int NumElts = Mask.size();
4789   ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
4790
4791   for (int i = 0; i != NumElts; ++i) {
4792     int M = Mask[i];
4793
4794     // Repeat sentinel values in every mask element.
4795     if (M < 0) {
4796       for (int s = 0; s != Scale; ++s)
4797         ScaledMask[(Scale * i) + s] = M;
4798       continue;
4799     }
4800
4801     // Scale mask element and increment across each mask element.
4802     for (int s = 0; s != Scale; ++s)
4803       ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4804   }
4805 }
4806
4807 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4808 /// extract that is suitable for instruction that extract 128 or 256 bit vectors
4809 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4810   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4811   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4812     return false;
4813
4814   // The index should be aligned on a vecWidth-bit boundary.
4815   uint64_t Index = N->getConstantOperandVal(1);
4816   MVT VT = N->getSimpleValueType(0);
4817   unsigned ElSize = VT.getScalarSizeInBits();
4818   return (Index * ElSize) % vecWidth == 0;
4819 }
4820
4821 /// Return true if the specified INSERT_SUBVECTOR
4822 /// operand specifies a subvector insert that is suitable for input to
4823 /// insertion of 128 or 256-bit subvectors
4824 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4825   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4826   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4827     return false;
4828
4829   // The index should be aligned on a vecWidth-bit boundary.
4830   uint64_t Index = N->getConstantOperandVal(2);
4831   MVT VT = N->getSimpleValueType(0);
4832   unsigned ElSize = VT.getScalarSizeInBits();
4833   return (Index * ElSize) % vecWidth == 0;
4834 }
4835
4836 bool X86::isVINSERT128Index(SDNode *N) {
4837   return isVINSERTIndex(N, 128);
4838 }
4839
4840 bool X86::isVINSERT256Index(SDNode *N) {
4841   return isVINSERTIndex(N, 256);
4842 }
4843
4844 bool X86::isVEXTRACT128Index(SDNode *N) {
4845   return isVEXTRACTIndex(N, 128);
4846 }
4847
4848 bool X86::isVEXTRACT256Index(SDNode *N) {
4849   return isVEXTRACTIndex(N, 256);
4850 }
4851
4852 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4853   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4854   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4855          "Illegal extract subvector for VEXTRACT");
4856
4857   uint64_t Index = N->getConstantOperandVal(1);
4858   MVT VecVT = N->getOperand(0).getSimpleValueType();
4859   unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4860   return Index / NumElemsPerChunk;
4861 }
4862
4863 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4864   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4865   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4866          "Illegal insert subvector for VINSERT");
4867
4868   uint64_t Index = N->getConstantOperandVal(2);
4869   MVT VecVT = N->getSimpleValueType(0);
4870   unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4871   return Index / NumElemsPerChunk;
4872 }
4873
4874 /// Return the appropriate immediate to extract the specified
4875 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
4876 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4877   return getExtractVEXTRACTImmediate(N, 128);
4878 }
4879
4880 /// Return the appropriate immediate to extract the specified
4881 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
4882 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4883   return getExtractVEXTRACTImmediate(N, 256);
4884 }
4885
4886 /// Return the appropriate immediate to insert at the specified
4887 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4888 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4889   return getInsertVINSERTImmediate(N, 128);
4890 }
4891
4892 /// Return the appropriate immediate to insert at the specified
4893 /// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions.
4894 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4895   return getInsertVINSERTImmediate(N, 256);
4896 }
4897
4898 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4899 bool X86::isZeroNode(SDValue Elt) {
4900   return isNullConstant(Elt) || isNullFPConstant(Elt);
4901 }
4902
4903 // Build a vector of constants.
4904 // Use an UNDEF node if MaskElt == -1.
4905 // Split 64-bit constants in the 32-bit mode.
4906 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4907                               const SDLoc &dl, bool IsMask = false) {
4908
4909   SmallVector<SDValue, 32>  Ops;
4910   bool Split = false;
4911
4912   MVT ConstVecVT = VT;
4913   unsigned NumElts = VT.getVectorNumElements();
4914   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4915   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4916     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4917     Split = true;
4918   }
4919
4920   MVT EltVT = ConstVecVT.getVectorElementType();
4921   for (unsigned i = 0; i < NumElts; ++i) {
4922     bool IsUndef = Values[i] < 0 && IsMask;
4923     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4924       DAG.getConstant(Values[i], dl, EltVT);
4925     Ops.push_back(OpNode);
4926     if (Split)
4927       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4928                     DAG.getConstant(0, dl, EltVT));
4929   }
4930   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4931   if (Split)
4932     ConstsNode = DAG.getBitcast(VT, ConstsNode);
4933   return ConstsNode;
4934 }
4935
4936 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4937                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4938   assert(Bits.size() == Undefs.getBitWidth() &&
4939          "Unequal constant and undef arrays");
4940   SmallVector<SDValue, 32> Ops;
4941   bool Split = false;
4942
4943   MVT ConstVecVT = VT;
4944   unsigned NumElts = VT.getVectorNumElements();
4945   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4946   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4947     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4948     Split = true;
4949   }
4950
4951   MVT EltVT = ConstVecVT.getVectorElementType();
4952   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4953     if (Undefs[i]) {
4954       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4955       continue;
4956     }
4957     const APInt &V = Bits[i];
4958     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4959     if (Split) {
4960       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4961       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4962     } else if (EltVT == MVT::f32) {
4963       APFloat FV(APFloat::IEEEsingle(), V);
4964       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4965     } else if (EltVT == MVT::f64) {
4966       APFloat FV(APFloat::IEEEdouble(), V);
4967       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4968     } else {
4969       Ops.push_back(DAG.getConstant(V, dl, EltVT));
4970     }
4971   }
4972
4973   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4974   return DAG.getBitcast(VT, ConstsNode);
4975 }
4976
4977 /// Returns a vector of specified type with all zero elements.
4978 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4979                              SelectionDAG &DAG, const SDLoc &dl) {
4980   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4981           VT.getVectorElementType() == MVT::i1) &&
4982          "Unexpected vector type");
4983
4984   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4985   // type. This ensures they get CSE'd. But if the integer type is not
4986   // available, use a floating-point +0.0 instead.
4987   SDValue Vec;
4988   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4989     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4990   } else if (VT.getVectorElementType() == MVT::i1) {
4991     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4992            "Unexpected vector type");
4993     assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4994            "Unexpected vector type");
4995     Vec = DAG.getConstant(0, dl, VT);
4996   } else {
4997     unsigned Num32BitElts = VT.getSizeInBits() / 32;
4998     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4999   }
5000   return DAG.getBitcast(VT, Vec);
5001 }
5002
5003 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5004                                 const SDLoc &dl, unsigned vectorWidth) {
5005   EVT VT = Vec.getValueType();
5006   EVT ElVT = VT.getVectorElementType();
5007   unsigned Factor = VT.getSizeInBits()/vectorWidth;
5008   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5009                                   VT.getVectorNumElements()/Factor);
5010
5011   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
5012   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5013   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5014
5015   // This is the index of the first element of the vectorWidth-bit chunk
5016   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5017   IdxVal &= ~(ElemsPerChunk - 1);
5018
5019   // If the input is a buildvector just emit a smaller one.
5020   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5021     return DAG.getBuildVector(
5022         ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
5023
5024   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5025   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5026 }
5027
5028 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
5029 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5030 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5031 /// instructions or a simple subregister reference. Idx is an index in the
5032 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
5033 /// lowering EXTRACT_VECTOR_ELT operations easier.
5034 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5035                                    SelectionDAG &DAG, const SDLoc &dl) {
5036   assert((Vec.getValueType().is256BitVector() ||
5037           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5038   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5039 }
5040
5041 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5042 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5043                                    SelectionDAG &DAG, const SDLoc &dl) {
5044   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5045   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5046 }
5047
5048 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5049                                SelectionDAG &DAG, const SDLoc &dl,
5050                                unsigned vectorWidth) {
5051   assert((vectorWidth == 128 || vectorWidth == 256) &&
5052          "Unsupported vector width");
5053   // Inserting UNDEF is Result
5054   if (Vec.isUndef())
5055     return Result;
5056   EVT VT = Vec.getValueType();
5057   EVT ElVT = VT.getVectorElementType();
5058   EVT ResultVT = Result.getValueType();
5059
5060   // Insert the relevant vectorWidth bits.
5061   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5062   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5063
5064   // This is the index of the first element of the vectorWidth-bit chunk
5065   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5066   IdxVal &= ~(ElemsPerChunk - 1);
5067
5068   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5069   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5070 }
5071
5072 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
5073 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5074 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5075 /// simple superregister reference.  Idx is an index in the 128 bits
5076 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
5077 /// lowering INSERT_VECTOR_ELT operations easier.
5078 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5079                                   SelectionDAG &DAG, const SDLoc &dl) {
5080   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5081   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5082 }
5083
5084 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5085                                   SelectionDAG &DAG, const SDLoc &dl) {
5086   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5087   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5088 }
5089
5090 // Return true if the instruction zeroes the unused upper part of the
5091 // destination and accepts mask.
5092 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5093   switch (Opcode) {
5094   default:
5095     return false;
5096   case X86ISD::PCMPEQM:
5097   case X86ISD::PCMPGTM:
5098   case X86ISD::CMPM:
5099   case X86ISD::CMPMU:
5100     return true;
5101   }
5102 }
5103
5104 /// Insert i1-subvector to i1-vector.
5105 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5106                                 const X86Subtarget &Subtarget) {
5107
5108   SDLoc dl(Op);
5109   SDValue Vec = Op.getOperand(0);
5110   SDValue SubVec = Op.getOperand(1);
5111   SDValue Idx = Op.getOperand(2);
5112
5113   if (!isa<ConstantSDNode>(Idx))
5114     return SDValue();
5115
5116   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5117   if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
5118     return Op;
5119
5120   MVT OpVT = Op.getSimpleValueType();
5121   MVT SubVecVT = SubVec.getSimpleValueType();
5122   unsigned NumElems = OpVT.getVectorNumElements();
5123   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5124
5125   assert(IdxVal + SubVecNumElems <= NumElems &&
5126          IdxVal % SubVecVT.getSizeInBits() == 0 &&
5127          "Unexpected index value in INSERT_SUBVECTOR");
5128
5129   // There are 3 possible cases:
5130   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5131   // 2. Subvector should be inserted in the upper part
5132   //    (IdxVal + SubVecNumElems == NumElems)
5133   // 3. Subvector should be inserted in the middle (for example v2i1
5134   //    to v16i1, index 2)
5135
5136   // If this node widens - by concatenating zeroes - the type of the result
5137   // of a node with instruction that zeroes all upper (irrelevant) bits of the
5138   // output register, mark this node as legal to enable replacing them with
5139   // the v8i1 version of the previous instruction during instruction selection.
5140   // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
5141   // while zeroing all the upper remaining 60 bits of the register. if the
5142   // result of such instruction is inserted into an allZeroVector, then we can
5143   // safely remove insert_vector (in instruction selection) as the cmp instr
5144   // already zeroed the rest of the register.
5145   if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
5146       (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
5147        (SubVec.getOpcode() == ISD::AND &&
5148         (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
5149          isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
5150     return Op;
5151
5152   // extend to natively supported kshift
5153   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5154   MVT WideOpVT = OpVT;
5155   if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5156     WideOpVT = MinVT;
5157
5158   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5159   SDValue Undef = DAG.getUNDEF(WideOpVT);
5160   SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5161                                    Undef, SubVec, ZeroIdx);
5162
5163   // Extract sub-vector if require.
5164   auto ExtractSubVec = [&](SDValue V) {
5165     return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5166                                                 OpVT, V, ZeroIdx);
5167   };
5168
5169   if (Vec.isUndef()) {
5170     if (IdxVal != 0) {
5171       SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5172       WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5173                                ShiftBits);
5174     }
5175     return ExtractSubVec(WideSubVec);
5176   }
5177
5178   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5179     NumElems = WideOpVT.getVectorNumElements();
5180     unsigned ShiftLeft = NumElems - SubVecNumElems;
5181     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5182     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5183                       DAG.getConstant(ShiftLeft, dl, MVT::i8));
5184     Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5185       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5186     return ExtractSubVec(Vec);
5187   }
5188
5189   if (IdxVal == 0) {
5190     // Zero lower bits of the Vec
5191     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5192     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5193     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5194     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5195     // Merge them together, SubVec should be zero extended.
5196     WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5197                              getZeroVector(WideOpVT, Subtarget, DAG, dl),
5198                              SubVec, ZeroIdx);
5199     Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5200     return ExtractSubVec(Vec);
5201   }
5202
5203   // Simple case when we put subvector in the upper part
5204   if (IdxVal + SubVecNumElems == NumElems) {
5205     // Zero upper bits of the Vec
5206     WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5207                              DAG.getConstant(IdxVal, dl, MVT::i8));
5208     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5209     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5210     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5211     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5212     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5213     return ExtractSubVec(Vec);
5214   }
5215   // Subvector should be inserted in the middle - use shuffle
5216   WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5217                            SubVec, ZeroIdx);
5218   SmallVector<int, 64> Mask;
5219   for (unsigned i = 0; i < NumElems; ++i)
5220     Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5221                     i : i + NumElems);
5222   return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5223 }
5224
5225 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5226 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5227 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5228 /// large BUILD_VECTORS.
5229 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5230                                    unsigned NumElems, SelectionDAG &DAG,
5231                                    const SDLoc &dl) {
5232   SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5233   return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5234 }
5235
5236 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5237                                    unsigned NumElems, SelectionDAG &DAG,
5238                                    const SDLoc &dl) {
5239   SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5240   return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5241 }
5242
5243 /// Returns a vector of specified type with all bits set.
5244 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5245 /// Then bitcast to their original type, ensuring they get CSE'd.
5246 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5247   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5248          "Expected a 128/256/512-bit vector type");
5249
5250   APInt Ones = APInt::getAllOnesValue(32);
5251   unsigned NumElts = VT.getSizeInBits() / 32;
5252   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5253   return DAG.getBitcast(VT, Vec);
5254 }
5255
5256 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5257                               SelectionDAG &DAG) {
5258   EVT InVT = In.getValueType();
5259   assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5260
5261   if (VT.is128BitVector() && InVT.is128BitVector())
5262     return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5263                                 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5264
5265   // For 256-bit vectors, we only need the lower (128-bit) input half.
5266   // For 512-bit vectors, we only need the lower input half or quarter.
5267   if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5268     int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5269     In = extractSubVector(In, 0, DAG, DL,
5270                           std::max(128, (int)VT.getSizeInBits() / Scale));
5271   }
5272
5273   return DAG.getNode(Opc, DL, VT, In);
5274 }
5275
5276 /// Generate unpacklo/unpackhi shuffle mask.
5277 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5278                                     bool Unary) {
5279   assert(Mask.empty() && "Expected an empty shuffle mask vector");
5280   int NumElts = VT.getVectorNumElements();
5281   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5282
5283   for (int i = 0; i < NumElts; ++i) {
5284     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5285     int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5286     Pos += (Unary ? 0 : NumElts * (i % 2));
5287     Pos += (Lo ? 0 : NumEltsInLane / 2);
5288     Mask.push_back(Pos);
5289   }
5290 }
5291
5292 /// Returns a vector_shuffle node for an unpackl operation.
5293 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5294                           SDValue V1, SDValue V2) {
5295   SmallVector<int, 8> Mask;
5296   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5297   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5298 }
5299
5300 /// Returns a vector_shuffle node for an unpackh operation.
5301 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5302                           SDValue V1, SDValue V2) {
5303   SmallVector<int, 8> Mask;
5304   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5305   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5306 }
5307
5308 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5309 /// This produces a shuffle where the low element of V2 is swizzled into the
5310 /// zero/undef vector, landing at element Idx.
5311 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5312 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5313                                            bool IsZero,
5314                                            const X86Subtarget &Subtarget,
5315                                            SelectionDAG &DAG) {
5316   MVT VT = V2.getSimpleValueType();
5317   SDValue V1 = IsZero
5318     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5319   int NumElems = VT.getVectorNumElements();
5320   SmallVector<int, 16> MaskVec(NumElems);
5321   for (int i = 0; i != NumElems; ++i)
5322     // If this is the insertion idx, put the low elt of V2 here.
5323     MaskVec[i] = (i == Idx) ? NumElems : i;
5324   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5325 }
5326
5327 static SDValue peekThroughBitcasts(SDValue V) {
5328   while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5329     V = V.getOperand(0);
5330   return V;
5331 }
5332
5333 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5334   while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5335          V.getOperand(0).hasOneUse())
5336     V = V.getOperand(0);
5337   return V;
5338 }
5339
5340 static const Constant *getTargetConstantFromNode(SDValue Op) {
5341   Op = peekThroughBitcasts(Op);
5342
5343   auto *Load = dyn_cast<LoadSDNode>(Op);
5344   if (!Load)
5345     return nullptr;
5346
5347   SDValue Ptr = Load->getBasePtr();
5348   if (Ptr->getOpcode() == X86ISD::Wrapper ||
5349       Ptr->getOpcode() == X86ISD::WrapperRIP)
5350     Ptr = Ptr->getOperand(0);
5351
5352   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5353   if (!CNode || CNode->isMachineConstantPoolEntry())
5354     return nullptr;
5355
5356   return dyn_cast<Constant>(CNode->getConstVal());
5357 }
5358
5359 // Extract raw constant bits from constant pools.
5360 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5361                                           APInt &UndefElts,
5362                                           SmallVectorImpl<APInt> &EltBits,
5363                                           bool AllowWholeUndefs = true,
5364                                           bool AllowPartialUndefs = true) {
5365   assert(EltBits.empty() && "Expected an empty EltBits vector");
5366
5367   Op = peekThroughBitcasts(Op);
5368
5369   EVT VT = Op.getValueType();
5370   unsigned SizeInBits = VT.getSizeInBits();
5371   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5372   unsigned NumElts = SizeInBits / EltSizeInBits;
5373
5374   // Bitcast a source array of element bits to the target size.
5375   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5376     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5377     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5378     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5379            "Constant bit sizes don't match");
5380
5381     // Don't split if we don't allow undef bits.
5382     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5383     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5384       return false;
5385
5386     // If we're already the right size, don't bother bitcasting.
5387     if (NumSrcElts == NumElts) {
5388       UndefElts = UndefSrcElts;
5389       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5390       return true;
5391     }
5392
5393     // Extract all the undef/constant element data and pack into single bitsets.
5394     APInt UndefBits(SizeInBits, 0);
5395     APInt MaskBits(SizeInBits, 0);
5396
5397     for (unsigned i = 0; i != NumSrcElts; ++i) {
5398       unsigned BitOffset = i * SrcEltSizeInBits;
5399       if (UndefSrcElts[i])
5400         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5401       MaskBits.insertBits(SrcEltBits[i], BitOffset);
5402     }
5403
5404     // Split the undef/constant single bitset data into the target elements.
5405     UndefElts = APInt(NumElts, 0);
5406     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5407
5408     for (unsigned i = 0; i != NumElts; ++i) {
5409       unsigned BitOffset = i * EltSizeInBits;
5410       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5411
5412       // Only treat an element as UNDEF if all bits are UNDEF.
5413       if (UndefEltBits.isAllOnesValue()) {
5414         if (!AllowWholeUndefs)
5415           return false;
5416         UndefElts.setBit(i);
5417         continue;
5418       }
5419
5420       // If only some bits are UNDEF then treat them as zero (or bail if not
5421       // supported).
5422       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5423         return false;
5424
5425       APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5426       EltBits[i] = Bits.getZExtValue();
5427     }
5428     return true;
5429   };
5430
5431   // Collect constant bits and insert into mask/undef bit masks.
5432   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5433                                 unsigned UndefBitIndex) {
5434     if (!Cst)
5435       return false;
5436     if (isa<UndefValue>(Cst)) {
5437       Undefs.setBit(UndefBitIndex);
5438       return true;
5439     }
5440     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5441       Mask = CInt->getValue();
5442       return true;
5443     }
5444     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5445       Mask = CFP->getValueAPF().bitcastToAPInt();
5446       return true;
5447     }
5448     return false;
5449   };
5450
5451   // Extract constant bits from build vector.
5452   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5453     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5454     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5455
5456     APInt UndefSrcElts(NumSrcElts, 0);
5457     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5458     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5459       const SDValue &Src = Op.getOperand(i);
5460       if (Src.isUndef()) {
5461         UndefSrcElts.setBit(i);
5462         continue;
5463       }
5464       auto *Cst = cast<ConstantSDNode>(Src);
5465       SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5466     }
5467     return CastBitData(UndefSrcElts, SrcEltBits);
5468   }
5469
5470   // Extract constant bits from constant pool vector.
5471   if (auto *Cst = getTargetConstantFromNode(Op)) {
5472     Type *CstTy = Cst->getType();
5473     if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5474       return false;
5475
5476     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5477     unsigned NumSrcElts = CstTy->getVectorNumElements();
5478
5479     APInt UndefSrcElts(NumSrcElts, 0);
5480     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5481     for (unsigned i = 0; i != NumSrcElts; ++i)
5482       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5483                                UndefSrcElts, i))
5484         return false;
5485
5486     return CastBitData(UndefSrcElts, SrcEltBits);
5487   }
5488
5489   // Extract constant bits from a broadcasted constant pool scalar.
5490   if (Op.getOpcode() == X86ISD::VBROADCAST &&
5491       EltSizeInBits <= VT.getScalarSizeInBits()) {
5492     if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5493       unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5494       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5495
5496       APInt UndefSrcElts(NumSrcElts, 0);
5497       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5498       if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5499         if (UndefSrcElts[0])
5500           UndefSrcElts.setBits(0, NumSrcElts);
5501         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5502         return CastBitData(UndefSrcElts, SrcEltBits);
5503       }
5504     }
5505   }
5506
5507   // Extract a rematerialized scalar constant insertion.
5508   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5509       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5510       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5511     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5512     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5513
5514     APInt UndefSrcElts(NumSrcElts, 0);
5515     SmallVector<APInt, 64> SrcEltBits;
5516     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5517     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5518     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5519     return CastBitData(UndefSrcElts, SrcEltBits);
5520   }
5521
5522   return false;
5523 }
5524
5525 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5526                                         unsigned MaskEltSizeInBits,
5527                                         SmallVectorImpl<uint64_t> &RawMask) {
5528   APInt UndefElts;
5529   SmallVector<APInt, 64> EltBits;
5530
5531   // Extract the raw target constant bits.
5532   // FIXME: We currently don't support UNDEF bits or mask entries.
5533   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5534                                      EltBits, /* AllowWholeUndefs */ false,
5535                                      /* AllowPartialUndefs */ false))
5536     return false;
5537
5538   // Insert the extracted elements into the mask.
5539   for (APInt Elt : EltBits)
5540     RawMask.push_back(Elt.getZExtValue());
5541
5542   return true;
5543 }
5544
5545 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5546 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5547 /// operands in \p Ops, and returns true.
5548 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5549 /// IsUnary for shuffles which use a single input multiple times, and in those
5550 /// cases it will adjust the mask to only have indices within that single input.
5551 /// It is an error to call this with non-empty Mask/Ops vectors.
5552 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5553                                  SmallVectorImpl<SDValue> &Ops,
5554                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5555   unsigned NumElems = VT.getVectorNumElements();
5556   SDValue ImmN;
5557
5558   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5559   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5560
5561   IsUnary = false;
5562   bool IsFakeUnary = false;
5563   switch(N->getOpcode()) {
5564   case X86ISD::BLENDI:
5565     ImmN = N->getOperand(N->getNumOperands()-1);
5566     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5567     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5568     break;
5569   case X86ISD::SHUFP:
5570     ImmN = N->getOperand(N->getNumOperands()-1);
5571     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5572     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5573     break;
5574   case X86ISD::INSERTPS:
5575     ImmN = N->getOperand(N->getNumOperands()-1);
5576     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5577     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5578     break;
5579   case X86ISD::EXTRQI:
5580     if (isa<ConstantSDNode>(N->getOperand(1)) &&
5581         isa<ConstantSDNode>(N->getOperand(2))) {
5582       int BitLen = N->getConstantOperandVal(1);
5583       int BitIdx = N->getConstantOperandVal(2);
5584       DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5585       IsUnary = true;
5586     }
5587     break;
5588   case X86ISD::INSERTQI:
5589     if (isa<ConstantSDNode>(N->getOperand(2)) &&
5590         isa<ConstantSDNode>(N->getOperand(3))) {
5591       int BitLen = N->getConstantOperandVal(2);
5592       int BitIdx = N->getConstantOperandVal(3);
5593       DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5594       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5595     }
5596     break;
5597   case X86ISD::UNPCKH:
5598     DecodeUNPCKHMask(VT, Mask);
5599     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5600     break;
5601   case X86ISD::UNPCKL:
5602     DecodeUNPCKLMask(VT, Mask);
5603     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5604     break;
5605   case X86ISD::MOVHLPS:
5606     DecodeMOVHLPSMask(NumElems, Mask);
5607     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5608     break;
5609   case X86ISD::MOVLHPS:
5610     DecodeMOVLHPSMask(NumElems, Mask);
5611     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5612     break;
5613   case X86ISD::PALIGNR:
5614     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5615     ImmN = N->getOperand(N->getNumOperands()-1);
5616     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5617     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5618     Ops.push_back(N->getOperand(1));
5619     Ops.push_back(N->getOperand(0));
5620     break;
5621   case X86ISD::VSHLDQ:
5622     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5623     ImmN = N->getOperand(N->getNumOperands() - 1);
5624     DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5625     IsUnary = true;
5626     break;
5627   case X86ISD::VSRLDQ:
5628     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5629     ImmN = N->getOperand(N->getNumOperands() - 1);
5630     DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5631     IsUnary = true;
5632     break;
5633   case X86ISD::PSHUFD:
5634   case X86ISD::VPERMILPI:
5635     ImmN = N->getOperand(N->getNumOperands()-1);
5636     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5637     IsUnary = true;
5638     break;
5639   case X86ISD::PSHUFHW:
5640     ImmN = N->getOperand(N->getNumOperands()-1);
5641     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5642     IsUnary = true;
5643     break;
5644   case X86ISD::PSHUFLW:
5645     ImmN = N->getOperand(N->getNumOperands()-1);
5646     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5647     IsUnary = true;
5648     break;
5649   case X86ISD::VZEXT_MOVL:
5650     DecodeZeroMoveLowMask(VT, Mask);
5651     IsUnary = true;
5652     break;
5653   case X86ISD::VBROADCAST: {
5654     SDValue N0 = N->getOperand(0);
5655     // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5656     // add the pre-extracted value to the Ops vector.
5657     if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5658         N0.getOperand(0).getValueType() == VT &&
5659         N0.getConstantOperandVal(1) == 0)
5660       Ops.push_back(N0.getOperand(0));
5661
5662     // We only decode broadcasts of same-sized vectors, unless the broadcast
5663     // came from an extract from the original width. If we found one, we
5664     // pushed it the Ops vector above.
5665     if (N0.getValueType() == VT || !Ops.empty()) {
5666       DecodeVectorBroadcast(VT, Mask);
5667       IsUnary = true;
5668       break;
5669     }
5670     return false;
5671   }
5672   case X86ISD::VPERMILPV: {
5673     IsUnary = true;
5674     SDValue MaskNode = N->getOperand(1);
5675     unsigned MaskEltSize = VT.getScalarSizeInBits();
5676     SmallVector<uint64_t, 32> RawMask;
5677     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5678       DecodeVPERMILPMask(VT, RawMask, Mask);
5679       break;
5680     }
5681     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5682       DecodeVPERMILPMask(C, MaskEltSize, Mask);
5683       break;
5684     }
5685     return false;
5686   }
5687   case X86ISD::PSHUFB: {
5688     IsUnary = true;
5689     SDValue MaskNode = N->getOperand(1);
5690     SmallVector<uint64_t, 32> RawMask;
5691     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5692       DecodePSHUFBMask(RawMask, Mask);
5693       break;
5694     }
5695     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5696       DecodePSHUFBMask(C, Mask);
5697       break;
5698     }
5699     return false;
5700   }
5701   case X86ISD::VPERMI:
5702     ImmN = N->getOperand(N->getNumOperands()-1);
5703     DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5704     IsUnary = true;
5705     break;
5706   case X86ISD::MOVSS:
5707   case X86ISD::MOVSD:
5708     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5709     break;
5710   case X86ISD::VPERM2X128:
5711     ImmN = N->getOperand(N->getNumOperands()-1);
5712     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5713     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5714     break;
5715   case X86ISD::MOVSLDUP:
5716     DecodeMOVSLDUPMask(VT, Mask);
5717     IsUnary = true;
5718     break;
5719   case X86ISD::MOVSHDUP:
5720     DecodeMOVSHDUPMask(VT, Mask);
5721     IsUnary = true;
5722     break;
5723   case X86ISD::MOVDDUP:
5724     DecodeMOVDDUPMask(VT, Mask);
5725     IsUnary = true;
5726     break;
5727   case X86ISD::MOVLHPD:
5728   case X86ISD::MOVLPD:
5729   case X86ISD::MOVLPS:
5730     // Not yet implemented
5731     return false;
5732   case X86ISD::VPERMIL2: {
5733     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5734     unsigned MaskEltSize = VT.getScalarSizeInBits();
5735     SDValue MaskNode = N->getOperand(2);
5736     SDValue CtrlNode = N->getOperand(3);
5737     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5738       unsigned CtrlImm = CtrlOp->getZExtValue();
5739       SmallVector<uint64_t, 32> RawMask;
5740       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5741         DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5742         break;
5743       }
5744       if (auto *C = getTargetConstantFromNode(MaskNode)) {
5745         DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5746         break;
5747       }
5748     }
5749     return false;
5750   }
5751   case X86ISD::VPPERM: {
5752     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5753     SDValue MaskNode = N->getOperand(2);
5754     SmallVector<uint64_t, 32> RawMask;
5755     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5756       DecodeVPPERMMask(RawMask, Mask);
5757       break;
5758     }
5759     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5760       DecodeVPPERMMask(C, Mask);
5761       break;
5762     }
5763     return false;
5764   }
5765   case X86ISD::VPERMV: {
5766     IsUnary = true;
5767     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5768     Ops.push_back(N->getOperand(1));
5769     SDValue MaskNode = N->getOperand(0);
5770     SmallVector<uint64_t, 32> RawMask;
5771     unsigned MaskEltSize = VT.getScalarSizeInBits();
5772     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5773       DecodeVPERMVMask(RawMask, Mask);
5774       break;
5775     }
5776     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5777       DecodeVPERMVMask(C, MaskEltSize, Mask);
5778       break;
5779     }
5780     return false;
5781   }
5782   case X86ISD::VPERMV3: {
5783     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5784     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5785     Ops.push_back(N->getOperand(0));
5786     Ops.push_back(N->getOperand(2));
5787     SDValue MaskNode = N->getOperand(1);
5788     unsigned MaskEltSize = VT.getScalarSizeInBits();
5789     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5790       DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5791       break;
5792     }
5793     return false;
5794   }
5795   case X86ISD::VPERMIV3: {
5796     IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5797     // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5798     Ops.push_back(N->getOperand(1));
5799     Ops.push_back(N->getOperand(2));
5800     SDValue MaskNode = N->getOperand(0);
5801     unsigned MaskEltSize = VT.getScalarSizeInBits();
5802     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5803       DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5804       break;
5805     }
5806     return false;
5807   }
5808   default: llvm_unreachable("unknown target shuffle node");
5809   }
5810
5811   // Empty mask indicates the decode failed.
5812   if (Mask.empty())
5813     return false;
5814
5815   // Check if we're getting a shuffle mask with zero'd elements.
5816   if (!AllowSentinelZero)
5817     if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5818       return false;
5819
5820   // If we have a fake unary shuffle, the shuffle mask is spread across two
5821   // inputs that are actually the same node. Re-map the mask to always point
5822   // into the first input.
5823   if (IsFakeUnary)
5824     for (int &M : Mask)
5825       if (M >= (int)Mask.size())
5826         M -= Mask.size();
5827
5828   // If we didn't already add operands in the opcode-specific code, default to
5829   // adding 1 or 2 operands starting at 0.
5830   if (Ops.empty()) {
5831     Ops.push_back(N->getOperand(0));
5832     if (!IsUnary || IsFakeUnary)
5833       Ops.push_back(N->getOperand(1));
5834   }
5835
5836   return true;
5837 }
5838
5839 /// Check a target shuffle mask's inputs to see if we can set any values to
5840 /// SM_SentinelZero - this is for elements that are known to be zero
5841 /// (not just zeroable) from their inputs.
5842 /// Returns true if the target shuffle mask was decoded.
5843 static bool setTargetShuffleZeroElements(SDValue N,
5844                                          SmallVectorImpl<int> &Mask,
5845                                          SmallVectorImpl<SDValue> &Ops) {
5846   bool IsUnary;
5847   if (!isTargetShuffle(N.getOpcode()))
5848     return false;
5849
5850   MVT VT = N.getSimpleValueType();
5851   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5852     return false;
5853
5854   SDValue V1 = Ops[0];
5855   SDValue V2 = IsUnary ? V1 : Ops[1];
5856
5857   V1 = peekThroughBitcasts(V1);
5858   V2 = peekThroughBitcasts(V2);
5859
5860   assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5861          "Illegal split of shuffle value type");
5862   unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5863
5864   // Extract known constant input data.
5865   APInt UndefSrcElts[2];
5866   SmallVector<APInt, 32> SrcEltBits[2];
5867   bool IsSrcConstant[2] = {
5868       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5869                                     SrcEltBits[0], true, false),
5870       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5871                                     SrcEltBits[1], true, false)};
5872
5873   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5874     int M = Mask[i];
5875
5876     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5877     if (M < 0)
5878       continue;
5879
5880     // Determine shuffle input and normalize the mask.
5881     unsigned SrcIdx = M / Size;
5882     SDValue V = M < Size ? V1 : V2;
5883     M %= Size;
5884
5885     // We are referencing an UNDEF input.
5886     if (V.isUndef()) {
5887       Mask[i] = SM_SentinelUndef;
5888       continue;
5889     }
5890
5891     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5892     // TODO: We currently only set UNDEF for integer types - floats use the same
5893     // registers as vectors and many of the scalar folded loads rely on the
5894     // SCALAR_TO_VECTOR pattern.
5895     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5896         (Size % V.getValueType().getVectorNumElements()) == 0) {
5897       int Scale = Size / V.getValueType().getVectorNumElements();
5898       int Idx = M / Scale;
5899       if (Idx != 0 && !VT.isFloatingPoint())
5900         Mask[i] = SM_SentinelUndef;
5901       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5902         Mask[i] = SM_SentinelZero;
5903       continue;
5904     }
5905
5906     // Attempt to extract from the source's constant bits.
5907     if (IsSrcConstant[SrcIdx]) {
5908       if (UndefSrcElts[SrcIdx][M])
5909         Mask[i] = SM_SentinelUndef;
5910       else if (SrcEltBits[SrcIdx][M] == 0)
5911         Mask[i] = SM_SentinelZero;
5912     }
5913   }
5914
5915   assert(VT.getVectorNumElements() == Mask.size() &&
5916          "Different mask size from vector size!");
5917   return true;
5918 }
5919
5920 // Attempt to decode ops that could be represented as a shuffle mask.
5921 // The decoded shuffle mask may contain a different number of elements to the
5922 // destination value type.
5923 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5924                                SmallVectorImpl<SDValue> &Ops,
5925                                SelectionDAG &DAG) {
5926   Mask.clear();
5927   Ops.clear();
5928
5929   MVT VT = N.getSimpleValueType();
5930   unsigned NumElts = VT.getVectorNumElements();
5931   unsigned NumSizeInBits = VT.getSizeInBits();
5932   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5933   assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5934          "Expected byte aligned value types");
5935
5936   unsigned Opcode = N.getOpcode();
5937   switch (Opcode) {
5938   case ISD::AND:
5939   case X86ISD::ANDNP: {
5940     // Attempt to decode as a per-byte mask.
5941     APInt UndefElts;
5942     SmallVector<APInt, 32> EltBits;
5943     SDValue N0 = N.getOperand(0);
5944     SDValue N1 = N.getOperand(1);
5945     bool IsAndN = (X86ISD::ANDNP == Opcode);
5946     uint64_t ZeroMask = IsAndN ? 255 : 0;
5947     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5948       return false;
5949     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5950       if (UndefElts[i]) {
5951         Mask.push_back(SM_SentinelUndef);
5952         continue;
5953       }
5954       uint64_t ByteBits = EltBits[i].getZExtValue();
5955       if (ByteBits != 0 && ByteBits != 255)
5956         return false;
5957       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5958     }
5959     Ops.push_back(IsAndN ? N1 : N0);
5960     return true;
5961   }
5962   case ISD::SCALAR_TO_VECTOR: {
5963     // Match against a scalar_to_vector of an extract from a vector,
5964     // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5965     SDValue N0 = N.getOperand(0);
5966     SDValue SrcExtract;
5967
5968     if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5969         N0.getOperand(0).getValueType() == VT) {
5970       SrcExtract = N0;
5971     } else if (N0.getOpcode() == ISD::AssertZext &&
5972                N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
5973                cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
5974       SrcExtract = N0.getOperand(0);
5975       assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
5976     } else if (N0.getOpcode() == ISD::AssertZext &&
5977                N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
5978                cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
5979       SrcExtract = N0.getOperand(0);
5980       assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
5981     }
5982
5983     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5984       return false;
5985
5986     SDValue SrcVec = SrcExtract.getOperand(0);
5987     EVT SrcVT = SrcVec.getValueType();
5988     unsigned NumSrcElts = SrcVT.getVectorNumElements();
5989     unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5990
5991     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5992     if (NumSrcElts <= SrcIdx)
5993       return false;
5994
5995     Ops.push_back(SrcVec);
5996     Mask.push_back(SrcIdx);
5997     Mask.append(NumZeros, SM_SentinelZero);
5998     Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
5999     return true;
6000   }
6001   case X86ISD::PINSRB:
6002   case X86ISD::PINSRW: {
6003     SDValue InVec = N.getOperand(0);
6004     SDValue InScl = N.getOperand(1);
6005     uint64_t InIdx = N.getConstantOperandVal(2);
6006     assert(InIdx < NumElts && "Illegal insertion index");
6007
6008     // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6009     if (X86::isZeroNode(InScl)) {
6010       Ops.push_back(InVec);
6011       for (unsigned i = 0; i != NumElts; ++i)
6012         Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6013       return true;
6014     }
6015
6016     // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
6017     // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6018     unsigned ExOp =
6019         (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6020     if (InScl.getOpcode() != ISD::AssertZext ||
6021         InScl.getOperand(0).getOpcode() != ExOp)
6022       return false;
6023
6024     SDValue ExVec = InScl.getOperand(0).getOperand(0);
6025     uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
6026     assert(ExIdx < NumElts && "Illegal extraction index");
6027     Ops.push_back(InVec);
6028     Ops.push_back(ExVec);
6029     for (unsigned i = 0; i != NumElts; ++i)
6030       Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6031     return true;
6032   }
6033   case X86ISD::PACKSS: {
6034     // If we know input saturation won't happen we can treat this
6035     // as a truncation shuffle.
6036     if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
6037         DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
6038       return false;
6039
6040     Ops.push_back(N.getOperand(0));
6041     Ops.push_back(N.getOperand(1));
6042     for (unsigned i = 0; i != NumElts; ++i)
6043       Mask.push_back(i * 2);
6044     return true;
6045   }
6046   case X86ISD::VSHLI:
6047   case X86ISD::VSRLI: {
6048     uint64_t ShiftVal = N.getConstantOperandVal(1);
6049     // Out of range bit shifts are guaranteed to be zero.
6050     if (NumBitsPerElt <= ShiftVal) {
6051       Mask.append(NumElts, SM_SentinelZero);
6052       return true;
6053     }
6054
6055     // We can only decode 'whole byte' bit shifts as shuffles.
6056     if ((ShiftVal % 8) != 0)
6057       break;
6058
6059     uint64_t ByteShift = ShiftVal / 8;
6060     unsigned NumBytes = NumSizeInBits / 8;
6061     unsigned NumBytesPerElt = NumBitsPerElt / 8;
6062     Ops.push_back(N.getOperand(0));
6063
6064     // Clear mask to all zeros and insert the shifted byte indices.
6065     Mask.append(NumBytes, SM_SentinelZero);
6066
6067     if (X86ISD::VSHLI == Opcode) {
6068       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6069         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6070           Mask[i + j] = i + j - ByteShift;
6071     } else {
6072       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6073         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6074           Mask[i + j - ByteShift] = i + j;
6075     }
6076     return true;
6077   }
6078   case ISD::ZERO_EXTEND_VECTOR_INREG:
6079   case X86ISD::VZEXT: {
6080     // TODO - add support for VPMOVZX with smaller input vector types.
6081     SDValue Src = N.getOperand(0);
6082     MVT SrcVT = Src.getSimpleValueType();
6083     if (NumSizeInBits != SrcVT.getSizeInBits())
6084       break;
6085     DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6086     Ops.push_back(Src);
6087     return true;
6088   }
6089   }
6090
6091   return false;
6092 }
6093
6094 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6095 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6096                                               SmallVectorImpl<int> &Mask) {
6097   int MaskWidth = Mask.size();
6098   SmallVector<SDValue, 16> UsedInputs;
6099   for (int i = 0, e = Inputs.size(); i < e; ++i) {
6100     int lo = UsedInputs.size() * MaskWidth;
6101     int hi = lo + MaskWidth;
6102     if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6103       UsedInputs.push_back(Inputs[i]);
6104       continue;
6105     }
6106     for (int &M : Mask)
6107       if (lo <= M)
6108         M -= MaskWidth;
6109   }
6110   Inputs = UsedInputs;
6111 }
6112
6113 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6114 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6115 /// remaining input indices in case we now have a unary shuffle and adjust the
6116 /// inputs accordingly.
6117 /// Returns true if the target shuffle mask was decoded.
6118 static bool resolveTargetShuffleInputs(SDValue Op,
6119                                        SmallVectorImpl<SDValue> &Inputs,
6120                                        SmallVectorImpl<int> &Mask,
6121                                        SelectionDAG &DAG) {
6122   if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6123     if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6124       return false;
6125
6126   resolveTargetShuffleInputsAndMask(Inputs, Mask);
6127   return true;
6128 }
6129
6130 /// Returns the scalar element that will make up the ith
6131 /// element of the result of the vector shuffle.
6132 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6133                                    unsigned Depth) {
6134   if (Depth == 6)
6135     return SDValue();  // Limit search depth.
6136
6137   SDValue V = SDValue(N, 0);
6138   EVT VT = V.getValueType();
6139   unsigned Opcode = V.getOpcode();
6140
6141   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6142   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6143     int Elt = SV->getMaskElt(Index);
6144
6145     if (Elt < 0)
6146       return DAG.getUNDEF(VT.getVectorElementType());
6147
6148     unsigned NumElems = VT.getVectorNumElements();
6149     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6150                                          : SV->getOperand(1);
6151     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6152   }
6153
6154   // Recurse into target specific vector shuffles to find scalars.
6155   if (isTargetShuffle(Opcode)) {
6156     MVT ShufVT = V.getSimpleValueType();
6157     MVT ShufSVT = ShufVT.getVectorElementType();
6158     int NumElems = (int)ShufVT.getVectorNumElements();
6159     SmallVector<int, 16> ShuffleMask;
6160     SmallVector<SDValue, 16> ShuffleOps;
6161     bool IsUnary;
6162
6163     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6164       return SDValue();
6165
6166     int Elt = ShuffleMask[Index];
6167     if (Elt == SM_SentinelZero)
6168       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6169                                  : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6170     if (Elt == SM_SentinelUndef)
6171       return DAG.getUNDEF(ShufSVT);
6172
6173     assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6174     SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6175     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6176                                Depth+1);
6177   }
6178
6179   // Actual nodes that may contain scalar elements
6180   if (Opcode == ISD::BITCAST) {
6181     V = V.getOperand(0);
6182     EVT SrcVT = V.getValueType();
6183     unsigned NumElems = VT.getVectorNumElements();
6184
6185     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6186       return SDValue();
6187   }
6188
6189   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6190     return (Index == 0) ? V.getOperand(0)
6191                         : DAG.getUNDEF(VT.getVectorElementType());
6192
6193   if (V.getOpcode() == ISD::BUILD_VECTOR)
6194     return V.getOperand(Index);
6195
6196   return SDValue();
6197 }
6198
6199 /// Custom lower build_vector of v16i8.
6200 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6201                                      unsigned NumNonZero, unsigned NumZero,
6202                                      SelectionDAG &DAG,
6203                                      const X86Subtarget &Subtarget) {
6204   if (NumNonZero > 8 && !Subtarget.hasSSE41())
6205     return SDValue();
6206
6207   SDLoc dl(Op);
6208   SDValue V;
6209   bool First = true;
6210
6211   // SSE4.1 - use PINSRB to insert each byte directly.
6212   if (Subtarget.hasSSE41()) {
6213     for (unsigned i = 0; i < 16; ++i) {
6214       bool IsNonZero = (NonZeros & (1 << i)) != 0;
6215       if (IsNonZero) {
6216         // If the build vector contains zeros or our first insertion is not the
6217         // first index then insert into zero vector to break any register
6218         // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6219         if (First) {
6220           First = false;
6221           if (NumZero || 0 != i)
6222             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6223           else {
6224             assert(0 == i && "Expected insertion into zero-index");
6225             V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6226             V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6227             V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6228             V = DAG.getBitcast(MVT::v16i8, V);
6229             continue;
6230           }
6231         }
6232         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6233                         Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6234       }
6235     }
6236
6237     return V;
6238   }
6239
6240   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6241   for (unsigned i = 0; i < 16; ++i) {
6242     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6243     if (ThisIsNonZero && First) {
6244       if (NumZero)
6245         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6246       else
6247         V = DAG.getUNDEF(MVT::v8i16);
6248       First = false;
6249     }
6250
6251     if ((i & 1) != 0) {
6252       // FIXME: Investigate extending to i32 instead of just i16.
6253       // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6254       SDValue ThisElt, LastElt;
6255       bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6256       if (LastIsNonZero) {
6257         LastElt =
6258             DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6259       }
6260       if (ThisIsNonZero) {
6261         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6262         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6263                               DAG.getConstant(8, dl, MVT::i8));
6264         if (LastIsNonZero)
6265           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6266       } else
6267         ThisElt = LastElt;
6268
6269       if (ThisElt) {
6270         if (1 == i) {
6271           V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6272                       : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6273           V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6274           V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6275           V = DAG.getBitcast(MVT::v8i16, V);
6276         } else {
6277           V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6278                           DAG.getIntPtrConstant(i / 2, dl));
6279         }
6280       }
6281     }
6282   }
6283
6284   return DAG.getBitcast(MVT::v16i8, V);
6285 }
6286
6287 /// Custom lower build_vector of v8i16.
     ///
     /// Builds the result by inserting, in index order, every operand of \p Op
     /// whose bit is set in \p NonZeros.
     ///
     /// \param Op         The BUILD_VECTOR node; operands 0..7 are read.
     /// \param NonZeros   Bitmask; bit i set means operand i is inserted.
     /// \param NumNonZero Compared against 4 to decide whether to bail out when
     ///                   SSE4.1 is unavailable (presumably the population count
     ///                   of \p NonZeros; confirm against the caller).
     /// \param NumZero    When non-zero, the first insertion starts from an
     ///                   all-zero vector.
     /// \returns The lowered v8i16 value, or an empty SDValue when more than four
     ///          elements would be inserted without SSE4.1. If no bit of
     ///          \p NonZeros is set in the low 8 bits, the returned SDValue is
     ///          also empty because nothing is ever assigned to it.
6288 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6289                                      unsigned NumNonZero, unsigned NumZero,
6290                                      SelectionDAG &DAG,
6291                                      const X86Subtarget &Subtarget) {
6292   if (NumNonZero > 4 && !Subtarget.hasSSE41())
6293     return SDValue();
6294
6295   SDLoc dl(Op);
6296   SDValue V;
       // True until the first selected element has been handled.
6297   bool First = true;
6298   for (unsigned i = 0; i < 8; ++i) {
6299     bool IsNonZero = (NonZeros & (1 << i)) != 0;
6300     if (IsNonZero) {
6301       // If the build vector contains zeros or our first insertion is not the
6302       // first index then insert into zero vector to break any register
6303       // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6304       if (First) {
6305         First = false;
6306         if (NumZero || 0 != i)
6307           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6308         else {
6309           assert(0 == i && "Expected insertion into zero-index");
               // Element 0 is widened to i32, placed in lane 0 of a v4i32 and
               // then viewed as v8i16; no INSERT_VECTOR_ELT is needed for it.
6310           V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6311           V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6312           V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6313           V = DAG.getBitcast(MVT::v8i16, V);
6314           continue;
6315         }
6316       }
6317       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6318                       Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6319     }
6320   }
6321
6322   return V;
6323 }
6324
6325 /// Custom lower build_vector of v4i32 or v4f32.
     ///
     /// Handles build_vectors whose four operands are each either zeroable
     /// (undef or zero) or an EXTRACT_VECTOR_ELT with a constant index from a
     /// 128-bit vector. Two lowerings are attempted:
     ///  - a shuffle of the source vector with a zero vector, when every
     ///    non-zeroable element i is element i of the same source; and
     ///  - an X86ISD::INSERTPS node (SSE4.1 only), when exactly one element
     ///    deviates from that identity pattern.
     ///
     /// \returns The lowered value, or an empty SDValue if neither form applies.
6326 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6327                                      const X86Subtarget &Subtarget) {
6328   // Find all zeroable elements.
6329   std::bitset<4> Zeroable;
6330   for (int i=0; i < 4; ++i) {
6331     SDValue Elt = Op->getOperand(i);
6332     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6333   }
6334   assert(Zeroable.size() - Zeroable.count() > 1 &&
6335          "We expect at least two non-zero elements!");
6336
6337   // We only know how to deal with build_vector nodes where elements are either
6338   // zeroable or extract_vector_elt with constant index.
6339   SDValue FirstNonZero;
       // Only assigned together with FirstNonZero; it is read below only after
       // the assert that FirstNonZero has been set.
6340   unsigned FirstNonZeroIdx;
6341   for (unsigned i=0; i < 4; ++i) {
6342     if (Zeroable[i])
6343       continue;
6344     SDValue Elt = Op->getOperand(i);
6345     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6346         !isa<ConstantSDNode>(Elt.getOperand(1)))
6347       return SDValue();
6348     // Make sure that this node is extracting from a 128-bit vector.
6349     MVT VT = Elt.getOperand(0).getSimpleValueType();
6350     if (!VT.is128BitVector())
6351       return SDValue();
6352     if (!FirstNonZero.getNode()) {
6353       FirstNonZero = Elt;
6354       FirstNonZeroIdx = i;
6355     }
6356   }
6357
6358   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6359   SDValue V1 = FirstNonZero.getOperand(0);
       // Note: VT is the type of the extracted-from vector, not the type of Op.
6360   MVT VT = V1.getSimpleValueType();
6361
6362   // See if this build_vector can be lowered as a blend with zero.
       // Elt, EltIdx and EltMaskIdx deliberately outlive the loop: on an early
       // break they describe the first element that does not follow the
       // "element i of V1" pattern.
6363   SDValue Elt;
6364   unsigned EltMaskIdx, EltIdx;
6365   int Mask[4];
6366   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6367     if (Zeroable[EltIdx]) {
6368       // The zero vector will be on the right hand side.
6369       Mask[EltIdx] = EltIdx+4;
6370       continue;
6371     }
6372
6373     Elt = Op->getOperand(EltIdx);
6374     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6375     EltMaskIdx = Elt.getConstantOperandVal(1);
6376     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6377       break;
6378     Mask[EltIdx] = EltIdx;
6379   }
6380
6381   if (EltIdx == 4) {
6382     // Let the shuffle legalizer deal with blend operations.
6383     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
         // NOTE(review): VT was taken from V1 above and V1 has not been
         // reassigned since, so this condition looks like it can never hold.
6384     if (V1.getSimpleValueType() != VT)
6385       V1 = DAG.getBitcast(VT, V1);
6386     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6387   }
6388
6389   // See if we can lower this build_vector to a INSERTPS.
6390   if (!Subtarget.hasSSE41())
6391     return SDValue();
6392
       // V2 is the vector supplying the single out-of-pattern element.
6393   SDValue V2 = Elt.getOperand(0);
       // If the out-of-pattern element is the very first non-zero element, V1 was
       // chosen from it; drop V1 and pick it from the next non-zero element.
6394   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6395     V1 = SDValue();
6396
       // Every remaining non-zeroable element must be element i of V1.
6397   bool CanFold = true;
6398   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6399     if (Zeroable[i])
6400       continue;
6401
6402     SDValue Current = Op->getOperand(i);
6403     SDValue SrcVector = Current->getOperand(0);
6404     if (!V1.getNode())
6405       V1 = SrcVector;
6406     CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6407   }
6408
6409   if (!CanFold)
6410     return SDValue();
6411
6412   assert(V1.getNode() && "Expected at least two non-zero elements!");
6413   if (V1.getSimpleValueType() != MVT::v4f32)
6414     V1 = DAG.getBitcast(MVT::v4f32, V1);
6415   if (V2.getSimpleValueType() != MVT::v4f32)
6416     V2 = DAG.getBitcast(MVT::v4f32, V2);
6417
6418   // Ok, we can emit an INSERTPS instruction.
6419   unsigned ZMask = Zeroable.to_ulong();
6420
       // Immediate layout: source element index in bits 7:6, destination element
       // index in bits 5:4, zero mask in the low four bits.
6421   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6422   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6423   SDLoc DL(Op);
6424   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6425                                DAG.getIntPtrConstant(InsertPSMask, DL));
6426   return DAG.getBitcast(VT, Result);
6427 }
6428
6429 /// Return a vector logical shift node.
     ///
     /// The shift is performed on the whole 128-bit value in byte units: \p SrcOp
     /// is bitcast to v16i8, shifted with X86ISD::VSHLDQ (when \p isLeft) or
     /// X86ISD::VSRLDQ by NumBits / 8, and the result is bitcast back to \p VT.
     ///
     /// \param isLeft  Selects a left (true) or right (false) shift.
     /// \param VT      Result type; asserted to be a 128-bit vector.
     /// \param SrcOp   Value to shift.
     /// \param NumBits Shift amount in bits; asserted to be a multiple of 8.
     /// \param TLI     Used to query the scalar shift-amount type.
6430 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6431                          SelectionDAG &DAG, const TargetLowering &TLI,
6432                          const SDLoc &dl) {
6433   assert(VT.is128BitVector() && "Unknown type for VShift");
6434   MVT ShVT = MVT::v16i8;
6435   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6436   SrcOp = DAG.getBitcast(ShVT, SrcOp);
6437   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6438   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6439   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6440   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6441 }
6442
     /// Try to replace a splat of a scalar load with a wide vector load followed
     /// by a splat shuffle.
     ///
     /// Only non-volatile, normal i32/f32 loads whose address is a frame index,
     /// or a frame index plus a constant offset, are handled. The frame object's
     /// alignment may be raised as a side effect so that the widened load is
     /// sufficiently aligned.
     ///
     /// \param SrcOp The scalar value being splatted.
     /// \param VT    Requested vector type; its size fixes the required
     ///              alignment and its element count fixes the shuffle width.
     /// \returns A vector shuffle of the widened load (typed as a vector of the
     ///          loaded scalar type with VT's element count), or an empty
     ///          SDValue when the pattern does not apply.
6443 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6444                                       SelectionDAG &DAG) {
6445
6446   // Check if the scalar load can be widened into a vector load. And if
6447   // the address is "base + cst" see if the cst can be "absorbed" into
6448   // the shuffle mask.
6449   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6450     SDValue Ptr = LD->getBasePtr();
6451     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6452       return SDValue();
6453     EVT PVT = LD->getValueType(0);
6454     if (PVT != MVT::i32 && PVT != MVT::f32)
6455       return SDValue();
6456
         // Decompose the address into a frame index and a constant byte offset.
6457     int FI = -1;
6458     int64_t Offset = 0;
6459     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6460       FI = FINode->getIndex();
6461       Offset = 0;
6462     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6463                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6464       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6465       Offset = Ptr.getConstantOperandVal(1);
6466       Ptr = Ptr.getOperand(0);
6467     } else {
6468       return SDValue();
6469     }
6470
6471     // FIXME: 256-bit vector instructions don't require a strict alignment,
6472     // improve this code to support it better.
6473     unsigned RequiredAlign = VT.getSizeInBits()/8;
6474     SDValue Chain = LD->getChain();
6475     // Make sure the stack object alignment is at least 16 or 32.
6476     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6477     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6478       if (MFI.isFixedObjectIndex(FI)) {
6479         // Can't change the alignment. FIXME: It's possible to compute
6480         // the exact stack offset and reference FI + adjust offset instead.
6481         // If someone *really* cares about this. That's the way to implement it.
6482         return SDValue();
6483       } else {
6484         MFI.setObjectAlignment(FI, RequiredAlign);
6485       }
6486     }
6487
6488     // (Offset % 16 or 32) must be multiple of 4. Then address is then
6489     // Ptr + (Offset & ~15).
6490     if (Offset < 0)
6491       return SDValue();
6492     if ((Offset % RequiredAlign) & 3)
6493       return SDValue();
         // Round the offset down to the start of the aligned vector-sized slot.
6494     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6495     if (StartOffset) {
6496       SDLoc DL(Ptr);
6497       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6498                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6499     }
6500
         // Index (in 4-byte elements) of the original scalar inside the slot.
6501     int EltNo = (Offset - StartOffset) >> 2;
6502     unsigned NumElems = VT.getVectorNumElements();
6503
6504     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6505     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6506                              LD->getPointerInfo().getWithOffset(StartOffset));
6507
         // Every lane of the shuffle selects element EltNo of the wide load.
6508     SmallVector<int, 8> Mask(NumElems, EltNo);
6509
6510     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6511   }
6512
6513   return SDValue();
6514 }
6515
6516 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6517 /// elements can be replaced by a single large load which has the same value as
6518 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6519 ///
6520 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
     ///
     /// Each element (looked at through bitcasts) must be undef, zero, or a
     /// non-extending load; anything else makes the function give up. Three
     /// results are possible besides failure:
     ///  - a plain wide load, when the loads are consecutive and cover the first
     ///    and last element with only undefs in between;
     ///  - a wide load shuffled with a zero vector, when zeros are interleaved
     ///    (only before legalization);
     ///  - an X86ISD::VZEXT_LOAD, when a 32- or 64-bit consecutive run starts at
     ///    element 0 and the vector is 128/256/512 bits wide.
     ///
     /// \param isAfterLegalize When true, a wide load is only created if
     ///        ISD::LOAD is legal for \p VT, and the shuffle form is not used.
     /// \returns The replacement value, or an empty SDValue.
6521 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6522                                         const SDLoc &DL, SelectionDAG &DAG,
6523                                         const X86Subtarget &Subtarget,
6524                                         bool isAfterLegalize) {
6525   unsigned NumElems = Elts.size();
6526
6527   int LastLoadedElt = -1;
6528   SmallBitVector LoadMask(NumElems, false);
6529   SmallBitVector ZeroMask(NumElems, false);
6530   SmallBitVector UndefMask(NumElems, false);
6531
6532   // For each element in the initializer, see if we've found a load, zero or an
6533   // undef.
6534   for (unsigned i = 0; i < NumElems; ++i) {
6535     SDValue Elt = peekThroughBitcasts(Elts[i]);
6536     if (!Elt.getNode())
6537       return SDValue();
6538
6539     if (Elt.isUndef())
6540       UndefMask[i] = true;
6541     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6542       ZeroMask[i] = true;
6543     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6544       LoadMask[i] = true;
6545       LastLoadedElt = i;
6546       // Each loaded element must be the correct fractional portion of the
6547       // requested vector load.
6548       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6549         return SDValue();
6550     } else
6551       return SDValue();
6552   }
6553   assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6554          "Incomplete element masks");
6555
6556   // Handle Special Cases - all undef or undef/zero.
6557   if (UndefMask.count() == NumElems)
6558     return DAG.getUNDEF(VT);
6559
6560   // FIXME: Should we return this as a BUILD_VECTOR instead?
6561   if ((ZeroMask | UndefMask).count() == NumElems)
6562     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6563                           : DAG.getConstantFP(0.0, DL, VT);
6564
       // From here on at least one element is a load, so find_first() succeeds.
6565   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6566   int FirstLoadedElt = LoadMask.find_first();
6567   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6568   LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6569   EVT LDBaseVT = EltBase.getValueType();
6570
6571   // Consecutive loads can contain UNDEFS but not ZERO elements.
6572   // Consecutive loads with UNDEFs and ZEROs elements require a
6573   // an additional shuffle stage to clear the ZERO elements.
6574   bool IsConsecutiveLoad = true;
6575   bool IsConsecutiveLoadWithZeros = true;
6576   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6577     if (LoadMask[i]) {
6578       SDValue Elt = peekThroughBitcasts(Elts[i]);
6579       LoadSDNode *LD = cast<LoadSDNode>(Elt);
6580       if (!DAG.areNonVolatileConsecutiveLoads(
6581               LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6582               i - FirstLoadedElt)) {
6583         IsConsecutiveLoad = false;
6584         IsConsecutiveLoadWithZeros = false;
6585         break;
6586       }
6587     } else if (ZeroMask[i]) {
           // A zero between loads rules out the plain wide load but still
           // allows the load-plus-shuffle form.
6588       IsConsecutiveLoad = false;
6589     }
6590   }
6591
       // Creates a load of type VT at LDBase's address, reusing its chain, pointer
       // info, alignment and memory-operand flags.
6592   auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6593     auto MMOFlags = LDBase->getMemOperand()->getFlags();
6594     assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6595            "Cannot merge volatile loads.");
6596     SDValue NewLd =
6597         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6598                     LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6599     DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
6600     return NewLd;
6601   };
6602
6603   // LOAD - all consecutive load/undefs (must start/end with a load).
6604   // If we have found an entire vector of loads and undefs, then return a large
6605   // load of the entire vector width starting at the base pointer.
6606   // If the vector contains zeros, then attempt to shuffle those elements.
6607   if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6608       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6609     assert(LDBase && "Did not find base load for merging consecutive loads");
6610     EVT EltVT = LDBase->getValueType(0);
6611     // Ensure that the input vector size for the merged loads matches the
6612     // cumulative size of the input elements.
6613     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6614       return SDValue();
6615
6616     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6617       return SDValue();
6618
6619     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6620     // will lower to regular temporal loads and use the cache.
6621     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6622         VT.is256BitVector() && !Subtarget.hasInt256())
6623       return SDValue();
6624
6625     if (IsConsecutiveLoad)
6626       return CreateLoad(VT, LDBase);
6627
6628     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6629     // vector and a zero vector to clear out the zero elements.
6630     if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
           // Mask entries: i selects the loaded element, i + NumElems selects
           // the zero vector, -1 leaves undef elements undefined.
6631       SmallVector<int, 4> ClearMask(NumElems, -1);
6632       for (unsigned i = 0; i < NumElems; ++i) {
6633         if (ZeroMask[i])
6634           ClearMask[i] = i + NumElems;
6635         else if (LoadMask[i])
6636           ClearMask[i] = i;
6637       }
6638       SDValue V = CreateLoad(VT, LDBase);
6639       SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6640                                  : DAG.getConstantFP(0.0, DL, VT);
6641       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6642     }
6643   }
6644
       // Size in bits of the span from the first to the last loaded element.
6645   int LoadSize =
6646       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6647
6648   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6649   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6650       (LoadSize == 32 || LoadSize == 64) &&
6651       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6652     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6653                                       : MVT::getIntegerVT(LoadSize);
6654     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6655     if (TLI.isTypeLegal(VecVT)) {
6656       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6657       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6658       SDValue ResNode =
6659           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6660                                   LDBase->getPointerInfo(),
6661                                   LDBase->getAlignment(),
6662                                   false/*isVolatile*/, true/*ReadMem*/,
6663                                   false/*WriteMem*/);
6664       DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
6665       return DAG.getBitcast(VT, ResNode);
6666     }
6667   }
6668
6669   return SDValue();
6670 }
6671
     /// Build an IR constant vector holding one repetition of a splat pattern.
     ///
     /// \p SplatValue is cut into SplatBitSize / ScalarSize consecutive fields of
     /// VT's scalar width, starting at bit 0; each field becomes one element.
     ///
     /// \param VT           Supplies the scalar width and whether elements are
     ///                     floating point (only 32- and 64-bit floats are
     ///                     supported; other widths hit the assert).
     /// \param SplatValue   Bits of the repeated pattern.
     /// \param SplatBitSize Width in bits of the repeated pattern.
     /// \param C            Context in which the constants are created.
     /// \returns The ConstantVector of the extracted elements.
6672 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6673                                    unsigned SplatBitSize, LLVMContext &C) {
6674   unsigned ScalarSize = VT.getScalarSizeInBits();
6675   unsigned NumElm = SplatBitSize / ScalarSize;
6676
6677   SmallVector<Constant *, 32> ConstantVec;
6678   for (unsigned i = 0; i < NumElm; i++) {
         // Element i occupies bits [ScalarSize * i, ScalarSize * (i + 1)).
6679     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6680     Constant *Const;
6681     if (VT.isFloatingPoint()) {
           // Reinterpret the raw bits as an IEEE float of the matching width.
6682       if (ScalarSize == 32) {
6683         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6684       } else {
6685         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6686         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6687       }
6688     } else
6689       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6690     ConstantVec.push_back(Const);
6691   }
6692   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6693 }
6694
6695 static bool isUseOfShuffle(SDNode *N) {
6696   for (auto *U : N->uses()) {
6697     if (isTargetShuffle(U->getOpcode()))
6698       return true;
6699     if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6700       return isUseOfShuffle(U);
6701   }
6702   return false;
6703 }
6704
6705 /// Attempt to use the vbroadcast instruction to generate a splat value
6706 /// from a splat BUILD_VECTOR which uses:
6707 ///  a. A single scalar load, or a constant.
6708 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6709 ///
6710 /// The VBROADCAST node is returned when a pattern is found,
6711 /// or SDValue() otherwise.
     ///
     /// \param BVOp      The BUILD_VECTOR node; its type is asserted to be a
     ///                  128-, 256- or 512-bit vector.
     /// \param Subtarget Queried for AVX/AVX2/VLX availability and 32-bit mode.
6712 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6713                                            const X86Subtarget &Subtarget,
6714                                            SelectionDAG &DAG) {
6715   // VBROADCAST requires AVX.
6716   // TODO: Splats could be generated for non-AVX CPUs using SSE
6717   // instructions, but there's less potential gain for only 128-bit vectors.
6718   if (!Subtarget.hasAVX())
6719     return SDValue();
6720
6721   MVT VT = BVOp->getSimpleValueType(0);
6722   SDLoc dl(BVOp);
6723
6724   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6725          "Unsupported vector type for broadcast.");
6726
6727   BitVector UndefElements;
6728   SDValue Ld = BVOp->getSplatValue(&UndefElements);
6729
6730   // We need a splat of a single value to use broadcast, and it doesn't
6731   // make any sense if the value is only in one element of the vector.
       // When that is not available, fall back to case (b): a repeated pattern
       // of constants wider than one element but narrower than the vector.
6732   if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6733     APInt SplatValue, Undef;
6734     unsigned SplatBitSize;
6735     bool HasUndef;
6736     // Check if this is a repeated constant pattern suitable for broadcasting.
6737     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6738         SplatBitSize > VT.getScalarSizeInBits() &&
6739         SplatBitSize < VT.getSizeInBits()) {
6740       // Avoid replacing with broadcast when it's a use of a shuffle
6741       // instruction to preserve the present custom lowering of shuffles.
           // The broadcast is also skipped when the node has a single use.
6742       if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6743         return SDValue();
6744       // replace BUILD_VECTOR with broadcast of the repeated constants.
6745       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6746       LLVMContext *Ctx = DAG.getContext();
6747       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
           // NOTE(review): hasAVX() was already required at function entry, so
           // this condition appears to be always true here.
6748       if (Subtarget.hasAVX()) {
6749         if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6750             !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6751           // Splatted value can fit in one INTEGER constant in constant pool.
6752           // Load the constant and broadcast it.
6753           MVT CVT = MVT::getIntegerVT(SplatBitSize);
6754           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6755           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6756           SDValue CP = DAG.getConstantPool(C, PVT);
6757           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6758
6759           unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6760           Ld = DAG.getLoad(
6761               CVT, dl, DAG.getEntryNode(), CP,
6762               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6763               Alignment);
6764           SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6765                                        MVT::getVectorVT(CVT, Repeat), Ld);
6766           return DAG.getBitcast(VT, Brdcst);
6767         } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6768           // Splatted value can fit in one FLOAT constant in constant pool.
6769           // Load the constant and broadcast it.
6770           // AVX have support for 32 and 64 bit broadcast for floats only.
6771           // No 64bit integer in 32bit subtarget.
6772           MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6773           // Lower the splat via APFloat directly, to avoid any conversion.
6774           Constant *C =
6775               SplatBitSize == 32
6776                   ? ConstantFP::get(*Ctx,
6777                                     APFloat(APFloat::IEEEsingle(), SplatValue))
6778                   : ConstantFP::get(*Ctx,
6779                                     APFloat(APFloat::IEEEdouble(), SplatValue));
6780           SDValue CP = DAG.getConstantPool(C, PVT);
6781           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6782
6783           unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6784           Ld = DAG.getLoad(
6785               CVT, dl, DAG.getEntryNode(), CP,
6786               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6787               Alignment);
6788           SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6789                                        MVT::getVectorVT(CVT, Repeat), Ld);
6790           return DAG.getBitcast(VT, Brdcst);
6791         } else if (SplatBitSize > 64) {
6792           // Load the vector of constants and broadcast it.
6793           MVT CVT = VT.getScalarType();
6794           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6795                                              *Ctx);
6796           SDValue VCP = DAG.getConstantPool(VecC, PVT);
6797           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6798           unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6799           Ld = DAG.getLoad(
6800               MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6801               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6802               Alignment);
6803           SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6804           return DAG.getBitcast(VT, Brdcst);
6805         }
6806       }
6807     }
6808     return SDValue();
6809   }
6810
       // Case (a): Ld is the single splatted scalar.
6811   bool ConstSplatVal =
6812       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6813
6814   // Make sure that all of the users of a non-constant load are from the
6815   // BUILD_VECTOR node.
6816   if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6817     return SDValue();
6818
6819   unsigned ScalarSize = Ld.getValueSizeInBits();
6820   bool IsGE256 = (VT.getSizeInBits() >= 256);
6821
6822   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6823   // instruction to save 8 or more bytes of constant pool data.
6824   // TODO: If multiple splats are generated to load the same constant,
6825   // it may be detrimental to overall size. There needs to be a way to detect
6826   // that condition to know if this is truly a size win.
6827   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6828
6829   // Handle broadcasting a single constant scalar from the constant pool
6830   // into a vector.
6831   // On Sandybridge (no AVX2), it is still better to load a constant vector
6832   // from the constant pool and not to broadcast it from a scalar.
6833   // But override that restriction when optimizing for size.
6834   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6835   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6836     EVT CVT = Ld.getValueType();
6837     assert(!CVT.isVector() && "Must not broadcast a vector type");
6838
6839     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6840     // For size optimization, also splat v2f64 and v2i64, and for size opt
6841     // with AVX2, also splat i8 and i16.
6842     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6843     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6844         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6845       const Constant *C = nullptr;
6846       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6847         C = CI->getConstantIntValue();
6848       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6849         C = CF->getConstantFPValue();
6850
6851       assert(C && "Invalid constant type");
6852
           // Materialize the scalar in the constant pool and broadcast the load.
6853       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6854       SDValue CP =
6855           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6856       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6857       Ld = DAG.getLoad(
6858           CVT, dl, DAG.getEntryNode(), CP,
6859           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6860           Alignment);
6861
6862       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6863     }
6864   }
6865
6866   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6867
6868   // Handle AVX2 in-register broadcasts.
6869   if (!IsLoad && Subtarget.hasInt256() &&
6870       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6871     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6872
6873   // The scalar source must be a normal load.
6874   if (!IsLoad)
6875     return SDValue();
6876
6877   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6878       (Subtarget.hasVLX() && ScalarSize == 64))
6879     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6880
6881   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6882   // double since there is no vbroadcastsd xmm
6883   if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6884     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6885       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6886   }
6887
6888   // Unsupported broadcast.
6889   return SDValue();
6890 }
6891
6892 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6893 /// underlying vector and index.
6894 ///
6895 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6896 /// index.
     ///
     /// \param ExtractedFromVec In: the vector operand of the extract. Out: the
     ///        first operand of that vector's shuffle when the extract can be
     ///        redirected to it; otherwise left unchanged.
     /// \param ExtIdx The extract index; must be a ConstantSDNode.
     /// \returns The index to use together with the (possibly updated)
     ///          \p ExtractedFromVec.
6897 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6898                                          SDValue ExtIdx) {
6899   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
       // Nothing to look through unless the source is a vector shuffle.
6900   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6901     return Idx;
6902
6903   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6904   // lowered this:
6905   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6906   // to:
6907   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6908   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6909   //                           undef)
6910   //                       Constant<0>)
6911   // In this case the vector is the extract_subvector expression and the index
6912   // is 2, as specified by the shuffle.
6913   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6914   SDValue ShuffleVec = SVOp->getOperand(0);
6915   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6916   assert(ShuffleVecVT.getVectorElementType() ==
6917          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6918
       // Only redirect when the mask entry refers to the shuffle's first operand.
       // NOTE(review): presumably isUndefOrInRange also accepts an undef mask
       // entry, in which case a negative index would be returned; confirm
       // against the helper.
6919   int ShuffleIdx = SVOp->getMaskElt(Idx);
6920   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6921     ExtractedFromVec = ShuffleVec;
6922     return ShuffleIdx;
6923   }
6924   return Idx;
6925 }
6926
     /// Lower a BUILD_VECTOR whose operands are mostly constant-index extracts
     /// from at most two vectors of the result type.
     ///
     /// The extracted elements are expressed as one vector shuffle of the source
     /// vectors; the few operands that are not extracts are then inserted one by
     /// one with INSERT_VECTOR_ELT. Undef operands are left undefined.
     ///
     /// \returns The shuffle-plus-inserts value, or an empty SDValue when
     ///          INSERT_VECTOR_ELT is neither legal nor custom for the type, an
     ///          extract index is not constant, a source vector has a different
     ///          type, more than two source vectors are involved, too many
     ///          operands need inserting, or no operand is an extract.
6927 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6928   MVT VT = Op.getSimpleValueType();
6929
6930   // Skip if insert_vec_elt is not supported.
6931   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6932   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6933     return SDValue();
6934
6935   SDLoc DL(Op);
6936   unsigned NumElems = Op.getNumOperands();
6937
6938   SDValue VecIn1;
6939   SDValue VecIn2;
       // Operand positions that are not extracts and must be inserted afterwards.
6940   SmallVector<unsigned, 4> InsertIndices;
       // Shuffle mask; -1 marks elements that stay undefined in the shuffle.
6941   SmallVector<int, 8> Mask(NumElems, -1);
6942
6943   for (unsigned i = 0; i != NumElems; ++i) {
6944     unsigned Opc = Op.getOperand(i).getOpcode();
6945
6946     if (Opc == ISD::UNDEF)
6947       continue;
6948
6949     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6950       // Quit if more than 2 elements need inserting (the check runs before
           // the push, so two insert positions are tolerated and a third fails).
6951       if (InsertIndices.size() > 1)
6952         return SDValue();
6953
6954       InsertIndices.push_back(i);
6955       continue;
6956     }
6957
6958     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6959     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6960
6961     // Quit if non-constant index.
6962     if (!isa<ConstantSDNode>(ExtIdx))
6963       return SDValue();
         // May replace ExtractedFromVec with the vector behind a shuffle.
6964     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6965
6966     // Quit if extracted from vector of different type.
6967     if (ExtractedFromVec.getValueType() != VT)
6968       return SDValue();
6969
6970     if (!VecIn1.getNode())
6971       VecIn1 = ExtractedFromVec;
6972     else if (VecIn1 != ExtractedFromVec) {
6973       if (!VecIn2.getNode())
6974         VecIn2 = ExtractedFromVec;
6975       else if (VecIn2 != ExtractedFromVec)
6976         // Quit if more than 2 vectors to shuffle
6977         return SDValue();
6978     }
6979
         // Elements of the second input are addressed past the first NumElems.
6980     if (ExtractedFromVec == VecIn1)
6981       Mask[i] = Idx;
6982     else if (ExtractedFromVec == VecIn2)
6983       Mask[i] = Idx + NumElems;
6984   }
6985
       // Without any extract operand there is nothing to shuffle.
6986   if (!VecIn1.getNode())
6987     return SDValue();
6988
6989   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6990   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6991
6992   for (unsigned Idx : InsertIndices)
6993     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6994                      DAG.getIntPtrConstant(Idx, DL));
6995
6996   return NV;
6997 }
6998
     /// Pack a constant vector of i1 elements into a single integer constant.
     ///
     /// Bit idx of the result is the low bit of operand idx; undef operands
     /// contribute a zero bit. \p Op is asserted to be a BUILD_VECTOR of constant
     /// nodes with 1-bit scalars.
     ///
     /// \returns An integer constant whose width is the vector's size in bits,
     ///          but at least 8 bits.
6999 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7000   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7001          Op.getScalarValueSizeInBits() == 1 &&
7002          "Can not convert non-constant vector");
7003   uint64_t Immediate = 0;
7004   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7005     SDValue In = Op.getOperand(idx);
7006     if (!In.isUndef())
7007       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7008   }
7009   SDLoc dl(Op);
       // Vectors narrower than 8 bits are widened to an 8-bit integer.
7010   MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7011   return DAG.getConstant(Immediate, dl, VT);
7012 }
7013 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7014 SDValue
7015 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7016
7017   MVT VT = Op.getSimpleValueType();
7018   assert((VT.getVectorElementType() == MVT::i1) &&
7019          "Unexpected type in LowerBUILD_VECTORvXi1!");
7020
7021   SDLoc dl(Op);
7022   if (ISD::isBuildVectorAllZeros(Op.getNode()))
7023     return DAG.getTargetConstant(0, dl, VT);
7024
7025   if (ISD::isBuildVectorAllOnes(Op.getNode()))
7026     return DAG.getTargetConstant(1, dl, VT);
7027
7028   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7029     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7030     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7031       return DAG.getBitcast(VT, Imm);
7032     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7033     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7034                         DAG.getIntPtrConstant(0, dl));
7035   }
7036
7037   // Vector has one or more non-const elements
7038   uint64_t Immediate = 0;
7039   SmallVector<unsigned, 16> NonConstIdx;
7040   bool IsSplat = true;
7041   bool HasConstElts = false;
7042   int SplatIdx = -1;
7043   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7044     SDValue In = Op.getOperand(idx);
7045     if (In.isUndef())
7046       continue;
7047     if (!isa<ConstantSDNode>(In))
7048       NonConstIdx.push_back(idx);
7049     else {
7050       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7051       HasConstElts = true;
7052     }
7053     if (SplatIdx < 0)
7054       SplatIdx = idx;
7055     else if (In != Op.getOperand(SplatIdx))
7056       IsSplat = false;
7057   }
7058
7059   // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7060   if (IsSplat)
7061     return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7062                          DAG.getConstant(1, dl, VT),
7063                          DAG.getConstant(0, dl, VT));
7064
7065   // insert elements one by one
7066   SDValue DstVec;
7067   SDValue Imm;
7068   if (Immediate) {
7069     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7070     Imm = DAG.getConstant(Immediate, dl, ImmVT);
7071   }
7072   else if (HasConstElts)
7073     Imm = DAG.getConstant(0, dl, VT);
7074   else
7075     Imm = DAG.getUNDEF(VT);
7076   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7077     DstVec = DAG.getBitcast(VT, Imm);
7078   else {
7079     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7080     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7081                          DAG.getIntPtrConstant(0, dl));
7082   }
7083
7084   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7085     unsigned InsertIdx = NonConstIdx[i];
7086     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7087                          Op.getOperand(InsertIdx),
7088                          DAG.getIntPtrConstant(InsertIdx, dl));
7089   }
7090   return DstVec;
7091 }
7092
7093 /// \brief Return true if \p N implements a horizontal binop and return the
7094 /// operands for the horizontal binop into V0 and V1.
7095 ///
7096 /// This is a helper function of LowerToHorizontalOp().
7097 /// This function checks that the build_vector \p N in input implements a
7098 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7099 /// operation to match.
7100 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7101 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7102 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7103 /// arithmetic sub.
7104 ///
7105 /// This function only analyzes elements of \p N whose indices are
7106 /// in range [BaseIdx, LastIdx).
7107 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7108                               SelectionDAG &DAG,
7109                               unsigned BaseIdx, unsigned LastIdx,
7110                               SDValue &V0, SDValue &V1) {
7111   EVT VT = N->getValueType(0);
7112
7113   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7114   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7115          "Invalid Vector in input!");
7116
7117   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7118   bool CanFold = true;
7119   unsigned ExpectedVExtractIdx = BaseIdx;
7120   unsigned NumElts = LastIdx - BaseIdx;
7121   V0 = DAG.getUNDEF(VT);
7122   V1 = DAG.getUNDEF(VT);
7123
7124   // Check if N implements a horizontal binop.
7125   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7126     SDValue Op = N->getOperand(i + BaseIdx);
7127
7128     // Skip UNDEFs.
7129     if (Op->isUndef()) {
7130       // Update the expected vector extract index.
7131       if (i * 2 == NumElts)
7132         ExpectedVExtractIdx = BaseIdx;
7133       ExpectedVExtractIdx += 2;
7134       continue;
7135     }
7136
7137     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7138
7139     if (!CanFold)
7140       break;
7141
7142     SDValue Op0 = Op.getOperand(0);
7143     SDValue Op1 = Op.getOperand(1);
7144
7145     // Try to match the following pattern:
7146     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7147     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7148         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7149         Op0.getOperand(0) == Op1.getOperand(0) &&
7150         isa<ConstantSDNode>(Op0.getOperand(1)) &&
7151         isa<ConstantSDNode>(Op1.getOperand(1)));
7152     if (!CanFold)
7153       break;
7154
7155     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7156     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7157
7158     if (i * 2 < NumElts) {
7159       if (V0.isUndef()) {
7160         V0 = Op0.getOperand(0);
7161         if (V0.getValueType() != VT)
7162           return false;
7163       }
7164     } else {
7165       if (V1.isUndef()) {
7166         V1 = Op0.getOperand(0);
7167         if (V1.getValueType() != VT)
7168           return false;
7169       }
7170       if (i * 2 == NumElts)
7171         ExpectedVExtractIdx = BaseIdx;
7172     }
7173
7174     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7175     if (I0 == ExpectedVExtractIdx)
7176       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7177     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7178       // Try to match the following dag sequence:
7179       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7180       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7181     } else
7182       CanFold = false;
7183
7184     ExpectedVExtractIdx += 2;
7185   }
7186
7187   return CanFold;
7188 }
7189
7190 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7191 /// a concat_vector.
7192 ///
7193 /// This is a helper function of LowerToHorizontalOp().
7194 /// This function expects two 256-bit vectors called V0 and V1.
7195 /// At first, each vector is split into two separate 128-bit vectors.
7196 /// Then, the resulting 128-bit vectors are used to implement two
7197 /// horizontal binary operations.
7198 ///
7199 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7200 ///
7201 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7202 /// the two new horizontal binop.
7203 /// When Mode is set, the first horizontal binop dag node would take as input
7204 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7205 /// horizontal binop dag node would take as input the lower 128-bit of V1
7206 /// and the upper 128-bit of V1.
7207 ///   Example:
7208 ///     HADD V0_LO, V0_HI
7209 ///     HADD V1_LO, V1_HI
7210 ///
7211 /// Otherwise, the first horizontal binop dag node takes as input the lower
7212 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7213 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7214 ///   Example:
7215 ///     HADD V0_LO, V1_LO
7216 ///     HADD V0_HI, V1_HI
7217 ///
7218 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7219 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7220 /// the upper 128-bits of the result.
7221 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7222                                      const SDLoc &DL, SelectionDAG &DAG,
7223                                      unsigned X86Opcode, bool Mode,
7224                                      bool isUndefLO, bool isUndefHI) {
7225   MVT VT = V0.getSimpleValueType();
7226   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7227          "Invalid nodes in input!");
7228
7229   unsigned NumElts = VT.getVectorNumElements();
7230   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7231   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7232   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7233   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7234   MVT NewVT = V0_LO.getSimpleValueType();
7235
7236   SDValue LO = DAG.getUNDEF(NewVT);
7237   SDValue HI = DAG.getUNDEF(NewVT);
7238
7239   if (Mode) {
7240     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7241     if (!isUndefLO && !V0->isUndef())
7242       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7243     if (!isUndefHI && !V1->isUndef())
7244       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7245   } else {
7246     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7247     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7248       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7249
7250     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7251       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7252   }
7253
7254   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7255 }
7256
7257 /// Returns true iff \p BV builds a vector with the result equivalent to
7258 /// the result of ADDSUB operation.
7259 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7260 /// are written to the parameters \p Opnd0 and \p Opnd1.
7261 static bool isAddSub(const BuildVectorSDNode *BV,
7262                      const X86Subtarget &Subtarget, SelectionDAG &DAG,
7263                      SDValue &Opnd0, SDValue &Opnd1) {
7264
7265   MVT VT = BV->getSimpleValueType(0);
7266   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7267       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7268       (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7269     return false;
7270
7271   unsigned NumElts = VT.getVectorNumElements();
7272   SDValue InVec0 = DAG.getUNDEF(VT);
7273   SDValue InVec1 = DAG.getUNDEF(VT);
7274
7275   // Odd-numbered elements in the input build vector are obtained from
7276   // adding two integer/float elements.
7277   // Even-numbered elements in the input build vector are obtained from
7278   // subtracting two integer/float elements.
7279   unsigned ExpectedOpcode = ISD::FSUB;
7280   unsigned NextExpectedOpcode = ISD::FADD;
7281   bool AddFound = false;
7282   bool SubFound = false;
7283
7284   for (unsigned i = 0, e = NumElts; i != e; ++i) {
7285     SDValue Op = BV->getOperand(i);
7286
7287     // Skip 'undef' values.
7288     unsigned Opcode = Op.getOpcode();
7289     if (Opcode == ISD::UNDEF) {
7290       std::swap(ExpectedOpcode, NextExpectedOpcode);
7291       continue;
7292     }
7293
7294     // Early exit if we found an unexpected opcode.
7295     if (Opcode != ExpectedOpcode)
7296       return false;
7297
7298     SDValue Op0 = Op.getOperand(0);
7299     SDValue Op1 = Op.getOperand(1);
7300
7301     // Try to match the following pattern:
7302     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7303     // Early exit if we cannot match that sequence.
7304     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7305         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7306         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7307         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7308         Op0.getOperand(1) != Op1.getOperand(1))
7309       return false;
7310
7311     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7312     if (I0 != i)
7313       return false;
7314
7315     // We found a valid add/sub node. Update the information accordingly.
7316     if (i & 1)
7317       AddFound = true;
7318     else
7319       SubFound = true;
7320
7321     // Update InVec0 and InVec1.
7322     if (InVec0.isUndef()) {
7323       InVec0 = Op0.getOperand(0);
7324       if (InVec0.getSimpleValueType() != VT)
7325         return false;
7326     }
7327     if (InVec1.isUndef()) {
7328       InVec1 = Op1.getOperand(0);
7329       if (InVec1.getSimpleValueType() != VT)
7330         return false;
7331     }
7332
7333     // Make sure that operands in input to each add/sub node always
7334     // come from a same pair of vectors.
7335     if (InVec0 != Op0.getOperand(0)) {
7336       if (ExpectedOpcode == ISD::FSUB)
7337         return false;
7338
7339       // FADD is commutable. Try to commute the operands
7340       // and then test again.
7341       std::swap(Op0, Op1);
7342       if (InVec0 != Op0.getOperand(0))
7343         return false;
7344     }
7345
7346     if (InVec1 != Op1.getOperand(0))
7347       return false;
7348
7349     // Update the pair of expected opcodes.
7350     std::swap(ExpectedOpcode, NextExpectedOpcode);
7351   }
7352
7353   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7354   if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7355     return false;
7356
7357   Opnd0 = InVec0;
7358   Opnd1 = InVec1;
7359   return true;
7360 }
7361
7362 /// Returns true if is possible to fold MUL and an idiom that has already been
7363 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7364 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7365 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7366 ///
7367 /// Prior to calling this function it should be known that there is some
7368 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7369 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7370 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7371 /// of \p Opnd0 uses is expected to be equal to 2.
7372 /// For example, this function may be called for the following IR:
7373 ///    %AB = fmul fast <2 x double> %A, %B
7374 ///    %Sub = fsub fast <2 x double> %AB, %C
7375 ///    %Add = fadd fast <2 x double> %AB, %C
7376 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7377 ///                            <2 x i32> <i32 0, i32 3>
7378 /// There is a def for %Addsub here, which potentially can be replaced by
7379 /// X86ISD::ADDSUB operation:
7380 ///    %Addsub = X86ISD::ADDSUB %AB, %C
7381 /// and such ADDSUB can further be replaced with FMADDSUB:
7382 ///    %Addsub = FMADDSUB %A, %B, %C.
7383 ///
7384 /// The main reason why this method is called before the replacement of the
7385 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7386 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7387 /// FMADDSUB is.
7388 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7389                        SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7390   if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7391       !Subtarget.hasAnyFMA())
7392     return false;
7393
7394   // FIXME: These checks must match the similar ones in
7395   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7396   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7397   // or MUL + ADDSUB to FMADDSUB.
7398   const TargetOptions &Options = DAG.getTarget().Options;
7399   bool AllowFusion =
7400       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7401   if (!AllowFusion)
7402     return false;
7403
7404   Opnd2 = Opnd1;
7405   Opnd1 = Opnd0.getOperand(1);
7406   Opnd0 = Opnd0.getOperand(0);
7407
7408   return true;
7409 }
7410
7411 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7412 /// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7413 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7414                                        const X86Subtarget &Subtarget,
7415                                        SelectionDAG &DAG) {
7416   SDValue Opnd0, Opnd1;
7417   if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7418     return SDValue();
7419
7420   MVT VT = BV->getSimpleValueType(0);
7421   SDLoc DL(BV);
7422
7423   // Try to generate X86ISD::FMADDSUB node here.
7424   SDValue Opnd2;
7425   if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7426     return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7427
7428   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7429   // the ADDSUB idiom has been successfully recognized. There are no known
7430   // X86 targets with 512-bit ADDSUB instructions!
7431   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7432   // recognition.
7433   if (VT.is512BitVector())
7434     return SDValue();
7435
7436   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7437 }
7438
7439 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7440 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7441                                    const X86Subtarget &Subtarget,
7442                                    SelectionDAG &DAG) {
7443   MVT VT = BV->getSimpleValueType(0);
7444   unsigned NumElts = VT.getVectorNumElements();
7445   unsigned NumUndefsLO = 0;
7446   unsigned NumUndefsHI = 0;
7447   unsigned Half = NumElts/2;
7448
7449   // Count the number of UNDEF operands in the build_vector in input.
7450   for (unsigned i = 0, e = Half; i != e; ++i)
7451     if (BV->getOperand(i)->isUndef())
7452       NumUndefsLO++;
7453
7454   for (unsigned i = Half, e = NumElts; i != e; ++i)
7455     if (BV->getOperand(i)->isUndef())
7456       NumUndefsHI++;
7457
7458   // Early exit if this is either a build_vector of all UNDEFs or all the
7459   // operands but one are UNDEF.
7460   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7461     return SDValue();
7462
7463   SDLoc DL(BV);
7464   SDValue InVec0, InVec1;
7465   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7466     // Try to match an SSE3 float HADD/HSUB.
7467     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7468       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7469
7470     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7471       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7472   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7473     // Try to match an SSSE3 integer HADD/HSUB.
7474     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7475       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7476
7477     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7478       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7479   }
7480
7481   if (!Subtarget.hasAVX())
7482     return SDValue();
7483
7484   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7485     // Try to match an AVX horizontal add/sub of packed single/double
7486     // precision floating point values from 256-bit vectors.
7487     SDValue InVec2, InVec3;
7488     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7489         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7490         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7491         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7492       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7493
7494     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7495         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7496         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7497         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7498       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7499   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7500     // Try to match an AVX2 horizontal add/sub of signed integers.
7501     SDValue InVec2, InVec3;
7502     unsigned X86Opcode;
7503     bool CanFold = true;
7504
7505     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7506         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7507         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7508         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7509       X86Opcode = X86ISD::HADD;
7510     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7511         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7512         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7513         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7514       X86Opcode = X86ISD::HSUB;
7515     else
7516       CanFold = false;
7517
7518     if (CanFold) {
7519       // Fold this build_vector into a single horizontal add/sub.
7520       // Do this only if the target has AVX2.
7521       if (Subtarget.hasAVX2())
7522         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7523
7524       // Do not try to expand this build_vector into a pair of horizontal
7525       // add/sub if we can emit a pair of scalar add/sub.
7526       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7527         return SDValue();
7528
7529       // Convert this build_vector into a pair of horizontal binop followed by
7530       // a concat vector.
7531       bool isUndefLO = NumUndefsLO == Half;
7532       bool isUndefHI = NumUndefsHI == Half;
7533       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7534                                    isUndefLO, isUndefHI);
7535     }
7536   }
7537
7538   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7539        VT == MVT::v16i16) && Subtarget.hasAVX()) {
7540     unsigned X86Opcode;
7541     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7542       X86Opcode = X86ISD::HADD;
7543     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7544       X86Opcode = X86ISD::HSUB;
7545     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7546       X86Opcode = X86ISD::FHADD;
7547     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7548       X86Opcode = X86ISD::FHSUB;
7549     else
7550       return SDValue();
7551
7552     // Don't try to expand this build_vector into a pair of horizontal add/sub
7553     // if we can simply emit a pair of scalar add/sub.
7554     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7555       return SDValue();
7556
7557     // Convert this build_vector into two horizontal add/sub followed by
7558     // a concat vector.
7559     bool isUndefLO = NumUndefsLO == Half;
7560     bool isUndefHI = NumUndefsHI == Half;
7561     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7562                                  isUndefLO, isUndefHI);
7563   }
7564
7565   return SDValue();
7566 }
7567
7568 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7569 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7570 /// just apply the bit to the vectors.
7571 /// NOTE: Its not in our interest to start make a general purpose vectorizer
7572 /// from this, but enough scalar bit operations are created from the later
7573 /// legalization + scalarization stages to need basic support.
7574 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7575                                        SelectionDAG &DAG) {
7576   SDLoc DL(Op);
7577   MVT VT = Op->getSimpleValueType(0);
7578   unsigned NumElems = VT.getVectorNumElements();
7579   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7580
7581   // Check that all elements have the same opcode.
7582   // TODO: Should we allow UNDEFS and if so how many?
7583   unsigned Opcode = Op->getOperand(0).getOpcode();
7584   for (unsigned i = 1; i < NumElems; ++i)
7585     if (Opcode != Op->getOperand(i).getOpcode())
7586       return SDValue();
7587
7588   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7589   switch (Opcode) {
7590   default:
7591     return SDValue();
7592   case ISD::AND:
7593   case ISD::XOR:
7594   case ISD::OR:
7595     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7596       return SDValue();
7597     break;
7598   }
7599
7600   SmallVector<SDValue, 4> LHSElts, RHSElts;
7601   for (SDValue Elt : Op->ops()) {
7602     SDValue LHS = Elt.getOperand(0);
7603     SDValue RHS = Elt.getOperand(1);
7604
7605     // We expect the canonicalized RHS operand to be the constant.
7606     if (!isa<ConstantSDNode>(RHS))
7607       return SDValue();
7608     LHSElts.push_back(LHS);
7609     RHSElts.push_back(RHS);
7610   }
7611
7612   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7613   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7614   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7615 }
7616
7617 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7618 /// functionality to do this, so it's all zeros, all ones, or some derivation
7619 /// that is cheap to calculate.
7620 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7621                                          const X86Subtarget &Subtarget) {
7622   SDLoc DL(Op);
7623   MVT VT = Op.getSimpleValueType();
7624
7625   // Vectors containing all zeros can be matched by pxor and xorps.
7626   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7627     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7628     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7629     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7630       return Op;
7631
7632     return getZeroVector(VT, Subtarget, DAG, DL);
7633   }
7634
7635   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7636   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7637   // vpcmpeqd on 256-bit vectors.
7638   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7639     if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7640         (VT == MVT::v8i32 && Subtarget.hasInt256()))
7641       return Op;
7642
7643     return getOnesVector(VT, DAG, DL);
7644   }
7645
7646   return SDValue();
7647 }
7648
7649 SDValue
7650 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7651   SDLoc dl(Op);
7652
7653   MVT VT = Op.getSimpleValueType();
7654   MVT ExtVT = VT.getVectorElementType();
7655   unsigned NumElems = Op.getNumOperands();
7656
7657   // Generate vectors for predicate vectors.
7658   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7659     return LowerBUILD_VECTORvXi1(Op, DAG);
7660
7661   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7662     return VectorConstant;
7663
7664   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7665   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7666     return AddSub;
7667   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7668     return HorizontalOp;
7669   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7670     return Broadcast;
7671   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7672     return BitOp;
7673
7674   unsigned EVTBits = ExtVT.getSizeInBits();
7675
7676   unsigned NumZero  = 0;
7677   unsigned NumNonZero = 0;
7678   uint64_t NonZeros = 0;
7679   bool IsAllConstants = true;
7680   SmallSet<SDValue, 8> Values;
7681   for (unsigned i = 0; i < NumElems; ++i) {
7682     SDValue Elt = Op.getOperand(i);
7683     if (Elt.isUndef())
7684       continue;
7685     Values.insert(Elt);
7686     if (Elt.getOpcode() != ISD::Constant &&
7687         Elt.getOpcode() != ISD::ConstantFP)
7688       IsAllConstants = false;
7689     if (X86::isZeroNode(Elt))
7690       NumZero++;
7691     else {
7692       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7693       NonZeros |= ((uint64_t)1 << i);
7694       NumNonZero++;
7695     }
7696   }
7697
7698   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
7699   if (NumNonZero == 0)
7700     return DAG.getUNDEF(VT);
7701
7702   // Special case for single non-zero, non-undef, element.
7703   if (NumNonZero == 1) {
7704     unsigned Idx = countTrailingZeros(NonZeros);
7705     SDValue Item = Op.getOperand(Idx);
7706
7707     // If this is an insertion of an i64 value on x86-32, and if the top bits of
7708     // the value are obviously zero, truncate the value to i32 and do the
7709     // insertion that way.  Only do this if the value is non-constant or if the
7710     // value is a constant being inserted into element 0.  It is cheaper to do
7711     // a constant pool load than it is to do a movd + shuffle.
7712     if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7713         (!IsAllConstants || Idx == 0)) {
7714       if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7715         // Handle SSE only.
7716         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7717         MVT VecVT = MVT::v4i32;
7718
7719         // Truncate the value (which may itself be a constant) to i32, and
7720         // convert it to a vector with movd (S2V+shuffle to zero extend).
7721         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7722         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7723         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7724                                       Item, Idx * 2, true, Subtarget, DAG));
7725       }
7726     }
7727
7728     // If we have a constant or non-constant insertion into the low element of
7729     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7730     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
7731     // depending on what the source datatype is.
7732     if (Idx == 0) {
7733       if (NumZero == 0)
7734         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7735
7736       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7737           (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7738         assert((VT.is128BitVector() || VT.is256BitVector() ||
7739                 VT.is512BitVector()) &&
7740                "Expected an SSE value type!");
7741         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7742         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7743         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7744       }
7745
7746       // We can't directly insert an i8 or i16 into a vector, so zero extend
7747       // it to i32 first.
7748       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7749         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7750         if (VT.getSizeInBits() >= 256) {
7751           MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7752           if (Subtarget.hasAVX()) {
7753             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7754             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7755           } else {
7756             // Without AVX, we need to extend to a 128-bit vector and then
7757             // insert into the 256-bit vector.
7758             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7759             SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7760             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7761           }
7762         } else {
7763           assert(VT.is128BitVector() && "Expected an SSE value type!");
7764           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7765           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7766         }
7767         return DAG.getBitcast(VT, Item);
7768       }
7769     }
7770
7771     // Is it a vector logical left shift?
7772     if (NumElems == 2 && Idx == 1 &&
7773         X86::isZeroNode(Op.getOperand(0)) &&
7774         !X86::isZeroNode(Op.getOperand(1))) {
7775       unsigned NumBits = VT.getSizeInBits();
7776       return getVShift(true, VT,
7777                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7778                                    VT, Op.getOperand(1)),
7779                        NumBits/2, DAG, *this, dl);
7780     }
7781
7782     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7783       return SDValue();
7784
7785     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7786     // is a non-constant being inserted into an element other than the low one,
7787     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7788     // movd/movss) to move this into the low element, then shuffle it into
7789     // place.
7790     if (EVTBits == 32) {
7791       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7792       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7793     }
7794   }
7795
7796   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7797   if (Values.size() == 1) {
7798     if (EVTBits == 32) {
7799       // Instead of a shuffle like this:
7800       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7801       // Check if it's possible to issue this instead.
7802       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7803       unsigned Idx = countTrailingZeros(NonZeros);
7804       SDValue Item = Op.getOperand(Idx);
7805       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7806         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7807     }
7808     return SDValue();
7809   }
7810
7811   // A vector full of immediates; various special cases are already
7812   // handled, so this is best done with a single constant-pool load.
7813   if (IsAllConstants)
7814     return SDValue();
7815
7816   // See if we can use a vector load to get all of the elements.
7817   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7818     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7819     if (SDValue LD =
7820             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
7821       return LD;
7822   }
7823
7824   // For AVX-length vectors, build the individual 128-bit pieces and use
7825   // shuffles to put them in place.
7826   if (VT.is256BitVector() || VT.is512BitVector()) {
7827     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7828
7829     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7830
7831     // Build both the lower and upper subvector.
7832     SDValue Lower =
7833         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7834     SDValue Upper = DAG.getBuildVector(
7835         HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7836
7837     // Recreate the wider vector with the lower and upper part.
7838     if (VT.is256BitVector())
7839       return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7840     return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7841   }
7842
7843   // Let legalizer expand 2-wide build_vectors.
7844   if (EVTBits == 64) {
7845     if (NumNonZero == 1) {
7846       // One half is zero or undef.
7847       unsigned Idx = countTrailingZeros(NonZeros);
7848       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7849                                Op.getOperand(Idx));
7850       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7851     }
7852     return SDValue();
7853   }
7854
7855   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7856   if (EVTBits == 8 && NumElems == 16)
7857     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7858                                           DAG, Subtarget))
7859       return V;
7860
7861   if (EVTBits == 16 && NumElems == 8)
7862     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7863                                           DAG, Subtarget))
7864       return V;
7865
7866   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7867   if (EVTBits == 32 && NumElems == 4)
7868     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7869       return V;
7870
7871   // If element VT is == 32 bits, turn it into a number of shuffles.
7872   if (NumElems == 4 && NumZero > 0) {
7873     SmallVector<SDValue, 8> Ops(NumElems);
7874     for (unsigned i = 0; i < 4; ++i) {
7875       bool isZero = !(NonZeros & (1ULL << i));
7876       if (isZero)
7877         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7878       else
7879         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7880     }
7881
7882     for (unsigned i = 0; i < 2; ++i) {
7883       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7884         default: break;
7885         case 0:
7886           Ops[i] = Ops[i*2];  // Must be a zero vector.
7887           break;
7888         case 1:
7889           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7890           break;
7891         case 2:
7892           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7893           break;
7894         case 3:
7895           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7896           break;
7897       }
7898     }
7899
7900     bool Reverse1 = (NonZeros & 0x3) == 2;
7901     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7902     int MaskVec[] = {
7903       Reverse1 ? 1 : 0,
7904       Reverse1 ? 0 : 1,
7905       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7906       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7907     };
7908     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7909   }
7910
7911   if (Values.size() > 1 && VT.is128BitVector()) {
7912     // Check for a build vector from mostly shuffle plus few inserting.
7913     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7914       return Sh;
7915
7916     // For SSE 4.1, use insertps to put the high elements into the low element.
7917     if (Subtarget.hasSSE41()) {
7918       SDValue Result;
7919       if (!Op.getOperand(0).isUndef())
7920         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7921       else
7922         Result = DAG.getUNDEF(VT);
7923
7924       for (unsigned i = 1; i < NumElems; ++i) {
7925         if (Op.getOperand(i).isUndef()) continue;
7926         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7927                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7928       }
7929       return Result;
7930     }
7931
7932     // Otherwise, expand into a number of unpckl*, start by extending each of
7933     // our (non-undef) elements to the full vector width with the element in the
7934     // bottom slot of the vector (which generates no code for SSE).
7935     SmallVector<SDValue, 8> Ops(NumElems);
7936     for (unsigned i = 0; i < NumElems; ++i) {
7937       if (!Op.getOperand(i).isUndef())
7938         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7939       else
7940         Ops[i] = DAG.getUNDEF(VT);
7941     }
7942
7943     // Next, we iteratively mix elements, e.g. for v4f32:
7944     //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
7945     //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
7946     //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
7947     for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
7948       // Generate scaled UNPCKL shuffle mask.
7949       SmallVector<int, 16> Mask;
7950       for(unsigned i = 0; i != Scale; ++i)
7951         Mask.push_back(i);
7952       for (unsigned i = 0; i != Scale; ++i)
7953         Mask.push_back(NumElems+i);
7954       Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
7955
7956       for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
7957         Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
7958     }
7959     return Ops[0];
7960   }
7961   return SDValue();
7962 }
7963
7964 // 256-bit AVX can use the vinsertf128 instruction
7965 // to create 256-bit vectors from two other 128-bit ones.
7966 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7967   SDLoc dl(Op);
7968   MVT ResVT = Op.getSimpleValueType();
7969
7970   assert((ResVT.is256BitVector() ||
7971           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7972
7973   SDValue V1 = Op.getOperand(0);
7974   SDValue V2 = Op.getOperand(1);
7975   unsigned NumElems = ResVT.getVectorNumElements();
7976   if (ResVT.is256BitVector())
7977     return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7978
7979   if (Op.getNumOperands() == 4) {
7980     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7981                                   ResVT.getVectorNumElements()/2);
7982     SDValue V3 = Op.getOperand(2);
7983     SDValue V4 = Op.getOperand(3);
7984     return concat256BitVectors(
7985         concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7986         concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7987         NumElems, DAG, dl);
7988   }
7989   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7990 }
7991
7992 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
7993 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
7994 static bool isExpandWithZeros(const SDValue &Op) {
7995   assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
7996          "Expand with zeros only possible in CONCAT_VECTORS nodes!");
7997
7998   for (unsigned i = 1; i < Op.getNumOperands(); i++)
7999     if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8000       return false;
8001
8002   return true;
8003 }
8004
8005 // Returns true if the given node is a type promotion (by concatenating i1
8006 // zeros) of the result of a node that already zeros all upper bits of
8007 // k-register.
8008 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8009   unsigned Opc = Op.getOpcode();
8010
8011   assert(Opc == ISD::CONCAT_VECTORS &&
8012          Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8013          "Unexpected node to check for type promotion!");
8014
8015   // As long as we are concatenating zeros to the upper part of a previous node
8016   // result, climb up the tree until a node with different opcode is
8017   // encountered
8018   while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8019     if (Opc == ISD::INSERT_SUBVECTOR) {
8020       if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8021           Op.getConstantOperandVal(2) == 0)
8022         Op = Op.getOperand(1);
8023       else
8024         return SDValue();
8025     } else { // Opc == ISD::CONCAT_VECTORS
8026       if (isExpandWithZeros(Op))
8027         Op = Op.getOperand(0);
8028       else
8029         return SDValue();
8030     }
8031     Opc = Op.getOpcode();
8032   }
8033
8034   // Check if the first inserted node zeroes the upper bits, or an 'and' result
8035   // of a node that zeros the upper bits (its masked version).
8036   if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8037       (Op.getOpcode() == ISD::AND &&
8038        (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8039         isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8040     return Op;
8041   }
8042
8043   return SDValue();
8044 }
8045
8046 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8047                                        const X86Subtarget &Subtarget,
8048                                        SelectionDAG & DAG) {
8049   SDLoc dl(Op);
8050   MVT ResVT = Op.getSimpleValueType();
8051   unsigned NumOfOperands = Op.getNumOperands();
8052
8053   assert(isPowerOf2_32(NumOfOperands) &&
8054          "Unexpected number of operands in CONCAT_VECTORS");
8055
8056   // If this node promotes - by concatenating zeroes - the type of the result
8057   // of a node with instruction that zeroes all upper (irrelevant) bits of the
8058   // output register, mark it as legal and catch the pattern in instruction
8059   // selection to avoid emitting extra insturctions (for zeroing upper bits).
8060   if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8061     SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
8062     SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
8063     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8064                        ZeroC);
8065   }
8066
8067   SDValue Undef = DAG.getUNDEF(ResVT);
8068   if (NumOfOperands > 2) {
8069     // Specialize the cases when all, or all but one, of the operands are undef.
8070     unsigned NumOfDefinedOps = 0;
8071     unsigned OpIdx = 0;
8072     for (unsigned i = 0; i < NumOfOperands; i++)
8073       if (!Op.getOperand(i).isUndef()) {
8074         NumOfDefinedOps++;
8075         OpIdx = i;
8076       }
8077     if (NumOfDefinedOps == 0)
8078       return Undef;
8079     if (NumOfDefinedOps == 1) {
8080       unsigned SubVecNumElts =
8081         Op.getOperand(OpIdx).getValueType().getVectorNumElements();
8082       SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
8083       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
8084                          Op.getOperand(OpIdx), IdxVal);
8085     }
8086
8087     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8088                                   ResVT.getVectorNumElements()/2);
8089     SmallVector<SDValue, 2> Ops;
8090     for (unsigned i = 0; i < NumOfOperands/2; i++)
8091       Ops.push_back(Op.getOperand(i));
8092     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8093     Ops.clear();
8094     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
8095       Ops.push_back(Op.getOperand(i));
8096     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8097     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8098   }
8099
8100   // 2 operands
8101   SDValue V1 = Op.getOperand(0);
8102   SDValue V2 = Op.getOperand(1);
8103   unsigned NumElems = ResVT.getVectorNumElements();
8104   assert(V1.getValueType() == V2.getValueType() &&
8105          V1.getValueType().getVectorNumElements() == NumElems/2 &&
8106          "Unexpected operands in CONCAT_VECTORS");
8107
8108   if (ResVT.getSizeInBits() >= 16)
8109     return Op; // The operation is legal with KUNPCK
8110
8111   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
8112   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
8113   SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
8114   if (IsZeroV1 && IsZeroV2)
8115     return ZeroVec;
8116
8117   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
8118   if (V2.isUndef())
8119     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8120   if (IsZeroV2)
8121     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8122
8123   SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
8124   if (V1.isUndef())
8125     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8126
8127   if (IsZeroV1)
8128     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8129
8130   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8131   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8132 }
8133
8134 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8135                                    const X86Subtarget &Subtarget,
8136                                    SelectionDAG &DAG) {
8137   MVT VT = Op.getSimpleValueType();
8138   if (VT.getVectorElementType() == MVT::i1)
8139     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8140
8141   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8142          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8143           Op.getNumOperands() == 4)));
8144
8145   // AVX can use the vinsertf128 instruction to create 256-bit vectors
8146   // from two other 128-bit ones.
8147
8148   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8149   return LowerAVXCONCAT_VECTORS(Op, DAG);
8150 }
8151
8152 //===----------------------------------------------------------------------===//
8153 // Vector shuffle lowering
8154 //
8155 // This is an experimental code path for lowering vector shuffles on x86. It is
8156 // designed to handle arbitrary vector shuffles and blends, gracefully
8157 // degrading performance as necessary. It works hard to recognize idiomatic
8158 // shuffles and lower them to optimal instruction patterns without leaving
8159 // a framework that allows reasonably efficient handling of all vector shuffle
8160 // patterns.
8161 //===----------------------------------------------------------------------===//
8162
8163 /// \brief Tiny helper function to identify a no-op mask.
8164 ///
8165 /// This is a somewhat boring predicate function. It checks whether the mask
8166 /// array input, which is assumed to be a single-input shuffle mask of the kind
8167 /// used by the X86 shuffle instructions (not a fully general
8168 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8169 /// in-place shuffle are 'no-op's.
8170 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8171   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8172     assert(Mask[i] >= -1 && "Out of bound mask element!");
8173     if (Mask[i] >= 0 && Mask[i] != i)
8174       return false;
8175   }
8176   return true;
8177 }
8178
8179 /// \brief Test whether there are elements crossing 128-bit lanes in this
8180 /// shuffle mask.
8181 ///
8182 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8183 /// and we routinely test for these.
8184 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8185   int LaneSize = 128 / VT.getScalarSizeInBits();
8186   int Size = Mask.size();
8187   for (int i = 0; i < Size; ++i)
8188     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8189       return true;
8190   return false;
8191 }
8192
8193 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8194 ///
8195 /// This checks a shuffle mask to see if it is performing the same
8196 /// lane-relative shuffle in each sub-lane. This trivially implies
8197 /// that it is also not lane-crossing. It may however involve a blend from the
8198 /// same lane of a second vector.
8199 ///
8200 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8201 /// non-trivial to compute in the face of undef lanes. The representation is
8202 /// suitable for use with existing 128-bit shuffles as entries from the second
8203 /// vector have been remapped to [LaneSize, 2*LaneSize).
8204 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8205                                   ArrayRef<int> Mask,
8206                                   SmallVectorImpl<int> &RepeatedMask) {
8207   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8208   RepeatedMask.assign(LaneSize, -1);
8209   int Size = Mask.size();
8210   for (int i = 0; i < Size; ++i) {
8211     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8212     if (Mask[i] < 0)
8213       continue;
8214     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8215       // This entry crosses lanes, so there is no way to model this shuffle.
8216       return false;
8217
8218     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8219     // Adjust second vector indices to start at LaneSize instead of Size.
8220     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8221                                 : Mask[i] % LaneSize + LaneSize;
8222     if (RepeatedMask[i % LaneSize] < 0)
8223       // This is the first non-undef entry in this slot of a 128-bit lane.
8224       RepeatedMask[i % LaneSize] = LocalM;
8225     else if (RepeatedMask[i % LaneSize] != LocalM)
8226       // Found a mismatch with the repeated mask.
8227       return false;
8228   }
8229   return true;
8230 }
8231
8232 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8233 static bool
8234 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8235                                 SmallVectorImpl<int> &RepeatedMask) {
8236   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8237 }
8238
8239 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8240 static bool
8241 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8242                                 SmallVectorImpl<int> &RepeatedMask) {
8243   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8244 }
8245
8246 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8247 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8248 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8249                                         ArrayRef<int> Mask,
8250                                         SmallVectorImpl<int> &RepeatedMask) {
8251   int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8252   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8253   int Size = Mask.size();
8254   for (int i = 0; i < Size; ++i) {
8255     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8256     if (Mask[i] == SM_SentinelUndef)
8257       continue;
8258     if (Mask[i] == SM_SentinelZero) {
8259       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8260         return false;
8261       RepeatedMask[i % LaneSize] = SM_SentinelZero;
8262       continue;
8263     }
8264     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8265       // This entry crosses lanes, so there is no way to model this shuffle.
8266       return false;
8267
8268     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8269     // Adjust second vector indices to start at LaneSize instead of Size.
8270     int LocalM =
8271         Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8272     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8273       // This is the first non-undef entry in this slot of a 128-bit lane.
8274       RepeatedMask[i % LaneSize] = LocalM;
8275     else if (RepeatedMask[i % LaneSize] != LocalM)
8276       // Found a mismatch with the repeated mask.
8277       return false;
8278   }
8279   return true;
8280 }
8281
8282 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8283 /// arguments.
8284 ///
8285 /// This is a fast way to test a shuffle mask against a fixed pattern:
8286 ///
8287 ///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8288 ///
8289 /// It returns true if the mask is exactly as wide as the argument list, and
8290 /// each element of the mask is either -1 (signifying undef) or the value given
8291 /// in the argument.
8292 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8293                                 ArrayRef<int> ExpectedMask) {
8294   if (Mask.size() != ExpectedMask.size())
8295     return false;
8296
8297   int Size = Mask.size();
8298
8299   // If the values are build vectors, we can look through them to find
8300   // equivalent inputs that make the shuffles equivalent.
8301   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8302   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8303
8304   for (int i = 0; i < Size; ++i) {
8305     assert(Mask[i] >= -1 && "Out of bound mask element!");
8306     if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8307       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8308       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8309       if (!MaskBV || !ExpectedBV ||
8310           MaskBV->getOperand(Mask[i] % Size) !=
8311               ExpectedBV->getOperand(ExpectedMask[i] % Size))
8312         return false;
8313     }
8314   }
8315
8316   return true;
8317 }
8318
8319 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8320 ///
8321 /// The masks must be exactly the same width.
8322 ///
8323 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8324 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8325 ///
8326 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8327 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8328                                       ArrayRef<int> ExpectedMask) {
8329   int Size = Mask.size();
8330   if (Size != (int)ExpectedMask.size())
8331     return false;
8332
8333   for (int i = 0; i < Size; ++i)
8334     if (Mask[i] == SM_SentinelUndef)
8335       continue;
8336     else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8337       return false;
8338     else if (Mask[i] != ExpectedMask[i])
8339       return false;
8340
8341   return true;
8342 }
8343
8344 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8345 // mask.
8346 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8347                                                     const APInt &Zeroable) {
8348   int NumElts = Mask.size();
8349   assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8350
8351   SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8352   for (int i = 0; i != NumElts; ++i) {
8353     int M = Mask[i];
8354     if (M == SM_SentinelUndef)
8355       continue;
8356     assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8357     TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8358   }
8359   return TargetMask;
8360 }
8361
8362 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8363 // instructions.
8364 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8365   if (VT != MVT::v8i32 && VT != MVT::v8f32)
8366     return false;
8367
8368   SmallVector<int, 8> Unpcklwd;
8369   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8370                           /* Unary = */ false);
8371   SmallVector<int, 8> Unpckhwd;
8372   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8373                           /* Unary = */ false);
8374   bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8375                          isTargetShuffleEquivalent(Mask, Unpckhwd));
8376   return IsUnpackwdMask;
8377 }
8378
8379 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8380 ///
8381 /// This helper function produces an 8-bit shuffle immediate corresponding to
8382 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8383 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8384 /// example.
8385 ///
8386 /// NB: We rely heavily on "undef" masks preserving the input lane.
8387 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8388   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8389   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8390   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8391   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8392   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8393
8394   unsigned Imm = 0;
8395   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8396   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8397   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8398   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8399   return Imm;
8400 }
8401
8402 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8403                                           SelectionDAG &DAG) {
8404   return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8405 }
8406
8407 /// \brief Compute whether each element of a shuffle is zeroable.
8408 ///
8409 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8410 /// Either it is an undef element in the shuffle mask, the element of the input
8411 /// referenced is undef, or the element of the input referenced is known to be
8412 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8413 /// as many lanes with this technique as possible to simplify the remaining
8414 /// shuffle.
8415 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8416                                             SDValue V1, SDValue V2) {
8417   APInt Zeroable(Mask.size(), 0);
8418   V1 = peekThroughBitcasts(V1);
8419   V2 = peekThroughBitcasts(V2);
8420
8421   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8422   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8423
8424   int VectorSizeInBits = V1.getValueSizeInBits();
8425   int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8426   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8427
8428   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8429     int M = Mask[i];
8430     // Handle the easy cases.
8431     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8432       Zeroable.setBit(i);
8433       continue;
8434     }
8435
8436     // Determine shuffle input and normalize the mask.
8437     SDValue V = M < Size ? V1 : V2;
8438     M %= Size;
8439
8440     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8441     if (V.getOpcode() != ISD::BUILD_VECTOR)
8442       continue;
8443
8444     // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8445     // the (larger) source element must be UNDEF/ZERO.
8446     if ((Size % V.getNumOperands()) == 0) {
8447       int Scale = Size / V->getNumOperands();
8448       SDValue Op = V.getOperand(M / Scale);
8449       if (Op.isUndef() || X86::isZeroNode(Op))
8450         Zeroable.setBit(i);
8451       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8452         APInt Val = Cst->getAPIntValue();
8453         Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8454         Val = Val.getLoBits(ScalarSizeInBits);
8455         if (Val == 0)
8456           Zeroable.setBit(i);
8457       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8458         APInt Val = Cst->getValueAPF().bitcastToAPInt();
8459         Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8460         Val = Val.getLoBits(ScalarSizeInBits);
8461         if (Val == 0)
8462           Zeroable.setBit(i);
8463       }
8464       continue;
8465     }
8466
8467     // If the BUILD_VECTOR has more elements then all the (smaller) source
8468     // elements must be UNDEF or ZERO.
8469     if ((V.getNumOperands() % Size) == 0) {
8470       int Scale = V->getNumOperands() / Size;
8471       bool AllZeroable = true;
8472       for (int j = 0; j < Scale; ++j) {
8473         SDValue Op = V.getOperand((M * Scale) + j);
8474         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8475       }
8476       if (AllZeroable)
8477         Zeroable.setBit(i);
8478       continue;
8479     }
8480   }
8481
8482   return Zeroable;
8483 }
8484
8485 // The Shuffle result is as follow:
8486 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
8487 // Each Zeroable's element correspond to a particular Mask's element.
8488 // As described in computeZeroableShuffleElements function.
8489 //
8490 // The function looks for a sub-mask that the nonzero elements are in
8491 // increasing order. If such sub-mask exist. The function returns true.
8492 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8493                                      ArrayRef<int> Mask, const EVT &VectorType,
8494                                      bool &IsZeroSideLeft) {
8495   int NextElement = -1;
8496   // Check if the Mask's nonzero elements are in increasing order.
8497   for (int i = 0, e = Mask.size(); i < e; i++) {
8498     // Checks if the mask's zeros elements are built from only zeros.
8499     assert(Mask[i] >= -1 && "Out of bound mask element!");
8500     if (Mask[i] < 0)
8501       return false;
8502     if (Zeroable[i])
8503       continue;
8504     // Find the lowest non zero element
8505     if (NextElement < 0) {
8506       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8507       IsZeroSideLeft = NextElement != 0;
8508     }
8509     // Exit if the mask's non zero elements are not in increasing order.
8510     if (NextElement != Mask[i])
8511       return false;
8512     NextElement++;
8513   }
8514   return true;
8515 }
8516
8517 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8518 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8519                                             ArrayRef<int> Mask, SDValue V1,
8520                                             SDValue V2,
8521                                             const APInt &Zeroable,
8522                                             const X86Subtarget &Subtarget,
8523                                             SelectionDAG &DAG) {
8524   int Size = Mask.size();
8525   int LaneSize = 128 / VT.getScalarSizeInBits();
8526   const int NumBytes = VT.getSizeInBits() / 8;
8527   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8528
8529   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8530          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8531          (Subtarget.hasBWI() && VT.is512BitVector()));
8532
8533   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8534   // Sign bit set in i8 mask means zero element.
8535   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8536
8537   SDValue V;
8538   for (int i = 0; i < NumBytes; ++i) {
8539     int M = Mask[i / NumEltBytes];
8540     if (M < 0) {
8541       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8542       continue;
8543     }
8544     if (Zeroable[i / NumEltBytes]) {
8545       PSHUFBMask[i] = ZeroMask;
8546       continue;
8547     }
8548
8549     // We can only use a single input of V1 or V2.
8550     SDValue SrcV = (M >= Size ? V2 : V1);
8551     if (V && V != SrcV)
8552       return SDValue();
8553     V = SrcV;
8554     M %= Size;
8555
8556     // PSHUFB can't cross lanes, ensure this doesn't happen.
8557     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8558       return SDValue();
8559
8560     M = M % LaneSize;
8561     M = M * NumEltBytes + (i % NumEltBytes);
8562     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8563   }
8564   assert(V && "Failed to find a source input");
8565
8566   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8567   return DAG.getBitcast(
8568       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8569                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8570 }
8571
8572 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8573                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
8574                            const SDLoc &dl);
8575
8576 // X86 has dedicated shuffle that can be lowered to VEXPAND
8577 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8578                                           const APInt &Zeroable,
8579                                           ArrayRef<int> Mask, SDValue &V1,
8580                                           SDValue &V2, SelectionDAG &DAG,
8581                                           const X86Subtarget &Subtarget) {
8582   bool IsLeftZeroSide = true;
8583   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8584                                 IsLeftZeroSide))
8585     return SDValue();
8586   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8587   MVT IntegerType =
8588       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8589   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8590   unsigned NumElts = VT.getVectorNumElements();
8591   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8592          "Unexpected number of vector elements");
8593   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8594                               Subtarget, DAG, DL);
8595   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8596   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8597   return DAG.getSelect(DL, VT, VMask,
8598                        DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8599                        ZeroVector);
8600 }
8601
8602 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8603                                         unsigned &UnpackOpcode, bool IsUnary,
8604                                         ArrayRef<int> TargetMask, SDLoc &DL,
8605                                         SelectionDAG &DAG,
8606                                         const X86Subtarget &Subtarget) {
8607   int NumElts = VT.getVectorNumElements();
8608
8609   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8610   for (int i = 0; i != NumElts; i += 2) {
8611     int M1 = TargetMask[i + 0];
8612     int M2 = TargetMask[i + 1];
8613     Undef1 &= (SM_SentinelUndef == M1);
8614     Undef2 &= (SM_SentinelUndef == M2);
8615     Zero1 &= isUndefOrZero(M1);
8616     Zero2 &= isUndefOrZero(M2);
8617   }
8618   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8619          "Zeroable shuffle detected");
8620
8621   // Attempt to match the target mask against the unpack lo/hi mask patterns.
8622   SmallVector<int, 64> Unpckl, Unpckh;
8623   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8624   if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8625     UnpackOpcode = X86ISD::UNPCKL;
8626     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8627     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8628     return true;
8629   }
8630
8631   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8632   if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8633     UnpackOpcode = X86ISD::UNPCKH;
8634     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8635     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8636     return true;
8637   }
8638
8639   // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8640   if (IsUnary && (Zero1 || Zero2)) {
8641     // Don't bother if we can blend instead.
8642     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8643         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8644       return false;
8645
8646     bool MatchLo = true, MatchHi = true;
8647     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8648       int M = TargetMask[i];
8649
8650       // Ignore if the input is known to be zero or the index is undef.
8651       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8652           (M == SM_SentinelUndef))
8653         continue;
8654
8655       MatchLo &= (M == Unpckl[i]);
8656       MatchHi &= (M == Unpckh[i]);
8657     }
8658
8659     if (MatchLo || MatchHi) {
8660       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8661       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8662       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8663       return true;
8664     }
8665   }
8666
8667   // If a binary shuffle, commute and try again.
8668   if (!IsUnary) {
8669     ShuffleVectorSDNode::commuteMask(Unpckl);
8670     if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8671       UnpackOpcode = X86ISD::UNPCKL;
8672       std::swap(V1, V2);
8673       return true;
8674     }
8675
8676     ShuffleVectorSDNode::commuteMask(Unpckh);
8677     if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8678       UnpackOpcode = X86ISD::UNPCKH;
8679       std::swap(V1, V2);
8680       return true;
8681     }
8682   }
8683
8684   return false;
8685 }
8686
8687 // X86 has dedicated unpack instructions that can handle specific blend
8688 // operations: UNPCKH and UNPCKL.
8689 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8690                                            ArrayRef<int> Mask, SDValue V1,
8691                                            SDValue V2, SelectionDAG &DAG) {
8692   SmallVector<int, 8> Unpckl;
8693   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8694   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8695     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8696
8697   SmallVector<int, 8> Unpckh;
8698   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8699   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8700     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8701
8702   // Commute and try again.
8703   ShuffleVectorSDNode::commuteMask(Unpckl);
8704   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8705     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8706
8707   ShuffleVectorSDNode::commuteMask(Unpckh);
8708   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8709     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8710
8711   return SDValue();
8712 }
8713
8714 /// \brief Try to emit a bitmask instruction for a shuffle.
8715 ///
8716 /// This handles cases where we can model a blend exactly as a bitmask due to
8717 /// one of the inputs being zeroable.
8718 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8719                                            SDValue V2, ArrayRef<int> Mask,
8720                                            const APInt &Zeroable,
8721                                            SelectionDAG &DAG) {
8722   assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8723   MVT EltVT = VT.getVectorElementType();
8724   SDValue Zero = DAG.getConstant(0, DL, EltVT);
8725   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8726   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8727   SDValue V;
8728   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8729     if (Zeroable[i])
8730       continue;
8731     if (Mask[i] % Size != i)
8732       return SDValue(); // Not a blend.
8733     if (!V)
8734       V = Mask[i] < Size ? V1 : V2;
8735     else if (V != (Mask[i] < Size ? V1 : V2))
8736       return SDValue(); // Can only let one input through the mask.
8737
8738     VMaskOps[i] = AllOnes;
8739   }
8740   if (!V)
8741     return SDValue(); // No non-zeroable elements!
8742
8743   SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8744   return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8745 }
8746
8747 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8748 ///
8749 /// This is used as a fallback approach when first class blend instructions are
8750 /// unavailable. Currently it is only suitable for integer vectors, but could
8751 /// be generalized for floating point vectors if desirable.
8752 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8753                                             SDValue V2, ArrayRef<int> Mask,
8754                                             SelectionDAG &DAG) {
8755   assert(VT.isInteger() && "Only supports integer vector types!");
8756   MVT EltVT = VT.getVectorElementType();
8757   SDValue Zero = DAG.getConstant(0, DL, EltVT);
8758   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8759   SmallVector<SDValue, 16> MaskOps;
8760   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8761     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8762       return SDValue(); // Shuffled input!
8763     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8764   }
8765
8766   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8767   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8768   // We have to cast V2 around.
8769   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8770   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8771                                       DAG.getBitcast(MaskVT, V1Mask),
8772                                       DAG.getBitcast(MaskVT, V2)));
8773   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8774 }
8775
8776 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8777                                     SDValue PreservedSrc,
8778                                     const X86Subtarget &Subtarget,
8779                                     SelectionDAG &DAG);
8780
8781 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8782                                       MutableArrayRef<int> TargetMask,
8783                                       bool &ForceV1Zero, bool &ForceV2Zero,
8784                                       uint64_t &BlendMask) {
8785   bool V1IsZeroOrUndef =
8786       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8787   bool V2IsZeroOrUndef =
8788       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8789
8790   BlendMask = 0;
8791   ForceV1Zero = false, ForceV2Zero = false;
8792   assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8793
8794   // Attempt to generate the binary blend mask. If an input is zero then
8795   // we can use any lane.
8796   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8797   for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8798     int M = TargetMask[i];
8799     if (M == SM_SentinelUndef)
8800       continue;
8801     if (M == i)
8802       continue;
8803     if (M == i + Size) {
8804       BlendMask |= 1ull << i;
8805       continue;
8806     }
8807     if (M == SM_SentinelZero) {
8808       if (V1IsZeroOrUndef) {
8809         ForceV1Zero = true;
8810         TargetMask[i] = i;
8811         continue;
8812       }
8813       if (V2IsZeroOrUndef) {
8814         ForceV2Zero = true;
8815         BlendMask |= 1ull << i;
8816         TargetMask[i] = i + Size;
8817         continue;
8818       }
8819     }
8820     return false;
8821   }
8822   return true;
8823 }
8824
/// Widen a per-element blend mask so that it addresses sub-elements.
///
/// Every set bit i (0 <= i < \p Size) of \p BlendMask is replaced by a run of
/// \p Scale consecutive set bits starting at bit i * \p Scale; bits at or
/// above \p Size are ignored.
///
/// The scaled mask has to fit in 64 bits, i.e. callers must ensure
/// \p Size * \p Scale <= 64.
///
/// \returns the scaled mask.
uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  // Run of Scale set bits, computed once. Shifting a 64-bit value by 64 is
  // undefined, so the full-width case is spelled out instead of shifted.
  const uint64_t EltBits = Scale >= 64 ? ~0ull : ((1ull << Scale) - 1);

  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= EltBits << (i * Scale);
  return ScaledMask;
}
8832
8833 /// \brief Try to emit a blend instruction for a shuffle.
8834 ///
8835 /// This doesn't do any checks for the availability of instructions for blending
8836 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8837 /// be matched in the backend with the type given. What it does check for is
8838 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8839 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8840                                          SDValue V2, ArrayRef<int> Original,
8841                                          const APInt &Zeroable,
8842                                          const X86Subtarget &Subtarget,
8843                                          SelectionDAG &DAG) {
8844   SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8845
8846   uint64_t BlendMask = 0;
8847   bool ForceV1Zero = false, ForceV2Zero = false;
8848   if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8849                                  BlendMask))
8850     return SDValue();
8851
8852   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8853   if (ForceV1Zero)
8854     V1 = getZeroVector(VT, Subtarget, DAG, DL);
8855   if (ForceV2Zero)
8856     V2 = getZeroVector(VT, Subtarget, DAG, DL);
8857
8858   switch (VT.SimpleTy) {
8859   case MVT::v2f64:
8860   case MVT::v4f32:
8861   case MVT::v4f64:
8862   case MVT::v8f32:
8863     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8864                        DAG.getConstant(BlendMask, DL, MVT::i8));
8865
8866   case MVT::v4i64:
8867   case MVT::v8i32:
8868     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8869     LLVM_FALLTHROUGH;
8870   case MVT::v2i64:
8871   case MVT::v4i32:
8872     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8873     // that instruction.
8874     if (Subtarget.hasAVX2()) {
8875       // Scale the blend by the number of 32-bit dwords per element.
8876       int Scale =  VT.getScalarSizeInBits() / 32;
8877       BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8878       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8879       V1 = DAG.getBitcast(BlendVT, V1);
8880       V2 = DAG.getBitcast(BlendVT, V2);
8881       return DAG.getBitcast(
8882           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8883                           DAG.getConstant(BlendMask, DL, MVT::i8)));
8884     }
8885     LLVM_FALLTHROUGH;
8886   case MVT::v8i16: {
8887     // For integer shuffles we need to expand the mask and cast the inputs to
8888     // v8i16s prior to blending.
8889     int Scale = 8 / VT.getVectorNumElements();
8890     BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8891     V1 = DAG.getBitcast(MVT::v8i16, V1);
8892     V2 = DAG.getBitcast(MVT::v8i16, V2);
8893     return DAG.getBitcast(VT,
8894                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8895                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
8896   }
8897
8898   case MVT::v16i16: {
8899     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8900     SmallVector<int, 8> RepeatedMask;
8901     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8902       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8903       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8904       BlendMask = 0;
8905       for (int i = 0; i < 8; ++i)
8906         if (RepeatedMask[i] >= 8)
8907           BlendMask |= 1ull << i;
8908       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8909                          DAG.getConstant(BlendMask, DL, MVT::i8));
8910     }
8911     LLVM_FALLTHROUGH;
8912   }
8913   case MVT::v16i8:
8914   case MVT::v32i8: {
8915     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8916            "256-bit byte-blends require AVX2 support!");
8917
8918     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8919       MVT IntegerType =
8920           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8921       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8922       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8923     }
8924
8925     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8926     if (SDValue Masked =
8927             lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8928       return Masked;
8929
8930     // Scale the blend by the number of bytes per element.
8931     int Scale = VT.getScalarSizeInBits() / 8;
8932
8933     // This form of blend is always done on bytes. Compute the byte vector
8934     // type.
8935     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8936
8937     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8938     // mix of LLVM's code generator and the x86 backend. We tell the code
8939     // generator that boolean values in the elements of an x86 vector register
8940     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8941     // mapping a select to operand #1, and 'false' mapping to operand #2. The
8942     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8943     // of the element (the remaining are ignored) and 0 in that high bit would
8944     // mean operand #1 while 1 in the high bit would mean operand #2. So while
8945     // the LLVM model for boolean values in vector elements gets the relevant
8946     // bit set, it is set backwards and over constrained relative to x86's
8947     // actual model.
8948     SmallVector<SDValue, 32> VSELECTMask;
8949     for (int i = 0, Size = Mask.size(); i < Size; ++i)
8950       for (int j = 0; j < Scale; ++j)
8951         VSELECTMask.push_back(
8952             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8953                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8954                                           MVT::i8));
8955
8956     V1 = DAG.getBitcast(BlendVT, V1);
8957     V2 = DAG.getBitcast(BlendVT, V2);
8958     return DAG.getBitcast(
8959         VT,
8960         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
8961                       V1, V2));
8962   }
8963   case MVT::v16f32:
8964   case MVT::v8f64:
8965   case MVT::v8i64:
8966   case MVT::v16i32:
8967   case MVT::v32i16:
8968   case MVT::v64i8: {
8969     MVT IntegerType =
8970         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8971     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8972     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8973   }
8974   default:
8975     llvm_unreachable("Not a supported integer vector type!");
8976   }
8977 }
8978
8979 /// \brief Try to lower as a blend of elements from two inputs followed by
8980 /// a single-input permutation.
8981 ///
8982 /// This matches the pattern where we can blend elements from two inputs and
8983 /// then reduce the shuffle to a single-input permutation.
8984 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8985                                                    SDValue V1, SDValue V2,
8986                                                    ArrayRef<int> Mask,
8987                                                    SelectionDAG &DAG) {
8988   // We build up the blend mask while checking whether a blend is a viable way
8989   // to reduce the shuffle.
8990   SmallVector<int, 32> BlendMask(Mask.size(), -1);
8991   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8992
8993   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8994     if (Mask[i] < 0)
8995       continue;
8996
8997     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8998
8999     if (BlendMask[Mask[i] % Size] < 0)
9000       BlendMask[Mask[i] % Size] = Mask[i];
9001     else if (BlendMask[Mask[i] % Size] != Mask[i])
9002       return SDValue(); // Can't blend in the needed input!
9003
9004     PermuteMask[i] = Mask[i] % Size;
9005   }
9006
9007   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9008   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9009 }
9010
9011 /// \brief Generic routine to decompose a shuffle and blend into independent
9012 /// blends and permutes.
9013 ///
9014 /// This matches the extremely common pattern for handling combined
9015 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9016 /// operations. It will try to pick the best arrangement of shuffles and
9017 /// blends.
9018 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9019                                                           MVT VT, SDValue V1,
9020                                                           SDValue V2,
9021                                                           ArrayRef<int> Mask,
9022                                                           SelectionDAG &DAG) {
9023   // Shuffle the input elements into the desired positions in V1 and V2 and
9024   // blend them together.
9025   SmallVector<int, 32> V1Mask(Mask.size(), -1);
9026   SmallVector<int, 32> V2Mask(Mask.size(), -1);
9027   SmallVector<int, 32> BlendMask(Mask.size(), -1);
9028   for (int i = 0, Size = Mask.size(); i < Size; ++i)
9029     if (Mask[i] >= 0 && Mask[i] < Size) {
9030       V1Mask[i] = Mask[i];
9031       BlendMask[i] = i;
9032     } else if (Mask[i] >= Size) {
9033       V2Mask[i] = Mask[i] - Size;
9034       BlendMask[i] = i + Size;
9035     }
9036
9037   // Try to lower with the simpler initial blend strategy unless one of the
9038   // input shuffles would be a no-op. We prefer to shuffle inputs as the
9039   // shuffle may be able to fold with a load or other benefit. However, when
9040   // we'll have to do 2x as many shuffles in order to achieve this, blending
9041   // first is a better strategy.
9042   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9043     if (SDValue BlendPerm =
9044             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9045       return BlendPerm;
9046
9047   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9048   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9049   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9050 }
9051
9052 /// \brief Try to lower a vector shuffle as a rotation.
9053 ///
9054 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9055 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9056                                       ArrayRef<int> Mask) {
9057   int NumElts = Mask.size();
9058
9059   // We need to detect various ways of spelling a rotation:
9060   //   [11, 12, 13, 14, 15,  0,  1,  2]
9061   //   [-1, 12, 13, 14, -1, -1,  1, -1]
9062   //   [-1, -1, -1, -1, -1, -1,  1,  2]
9063   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
9064   //   [-1,  4,  5,  6, -1, -1,  9, -1]
9065   //   [-1,  4,  5,  6, -1, -1, -1, -1]
9066   int Rotation = 0;
9067   SDValue Lo, Hi;
9068   for (int i = 0; i < NumElts; ++i) {
9069     int M = Mask[i];
9070     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9071            "Unexpected mask index.");
9072     if (M < 0)
9073       continue;
9074
9075     // Determine where a rotated vector would have started.
9076     int StartIdx = i - (M % NumElts);
9077     if (StartIdx == 0)
9078       // The identity rotation isn't interesting, stop.
9079       return -1;
9080
9081     // If we found the tail of a vector the rotation must be the missing
9082     // front. If we found the head of a vector, it must be how much of the
9083     // head.
9084     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9085
9086     if (Rotation == 0)
9087       Rotation = CandidateRotation;
9088     else if (Rotation != CandidateRotation)
9089       // The rotations don't match, so we can't match this mask.
9090       return -1;
9091
9092     // Compute which value this mask is pointing at.
9093     SDValue MaskV = M < NumElts ? V1 : V2;
9094
9095     // Compute which of the two target values this index should be assigned
9096     // to. This reflects whether the high elements are remaining or the low
9097     // elements are remaining.
9098     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9099
9100     // Either set up this value if we've not encountered it before, or check
9101     // that it remains consistent.
9102     if (!TargetV)
9103       TargetV = MaskV;
9104     else if (TargetV != MaskV)
9105       // This may be a rotation, but it pulls from the inputs in some
9106       // unsupported interleaving.
9107       return -1;
9108   }
9109
9110   // Check that we successfully analyzed the mask, and normalize the results.
9111   assert(Rotation != 0 && "Failed to locate a viable rotation!");
9112   assert((Lo || Hi) && "Failed to find a rotated input vector!");
9113   if (!Lo)
9114     Lo = Hi;
9115   else if (!Hi)
9116     Hi = Lo;
9117
9118   V1 = Lo;
9119   V2 = Hi;
9120
9121   return Rotation;
9122 }
9123
9124 /// \brief Try to lower a vector shuffle as a byte rotation.
9125 ///
9126 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9127 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9128 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9129 /// try to generically lower a vector shuffle through such an pattern. It
9130 /// does not check for the profitability of lowering either as PALIGNR or
9131 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9132 /// This matches shuffle vectors that look like:
9133 ///
9134 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9135 ///
9136 /// Essentially it concatenates V1 and V2, shifts right by some number of
9137 /// elements, and takes the low elements as the result. Note that while this is
9138 /// specified as a *right shift* because x86 is little-endian, it is a *left
9139 /// rotate* of the vector lanes.
9140 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9141                                           ArrayRef<int> Mask) {
9142   // Don't accept any shuffles with zero elements.
9143   if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9144     return -1;
9145
9146   // PALIGNR works on 128-bit lanes.
9147   SmallVector<int, 16> RepeatedMask;
9148   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9149     return -1;
9150
9151   int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9152   if (Rotation <= 0)
9153     return -1;
9154
9155   // PALIGNR rotates bytes, so we need to scale the
9156   // rotation based on how many bytes are in the vector lane.
9157   int NumElts = RepeatedMask.size();
9158   int Scale = 16 / NumElts;
9159   return Rotation * Scale;
9160 }
9161
9162 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9163                                               SDValue V1, SDValue V2,
9164                                               ArrayRef<int> Mask,
9165                                               const X86Subtarget &Subtarget,
9166                                               SelectionDAG &DAG) {
9167   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9168
9169   SDValue Lo = V1, Hi = V2;
9170   int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9171   if (ByteRotation <= 0)
9172     return SDValue();
9173
9174   // Cast the inputs to i8 vector of correct length to match PALIGNR or
9175   // PSLLDQ/PSRLDQ.
9176   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9177   Lo = DAG.getBitcast(ByteVT, Lo);
9178   Hi = DAG.getBitcast(ByteVT, Hi);
9179
9180   // SSSE3 targets can use the palignr instruction.
9181   if (Subtarget.hasSSSE3()) {
9182     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9183            "512-bit PALIGNR requires BWI instructions");
9184     return DAG.getBitcast(
9185         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9186                         DAG.getConstant(ByteRotation, DL, MVT::i8)));
9187   }
9188
9189   assert(VT.is128BitVector() &&
9190          "Rotate-based lowering only supports 128-bit lowering!");
9191   assert(Mask.size() <= 16 &&
9192          "Can shuffle at most 16 bytes in a 128-bit vector!");
9193   assert(ByteVT == MVT::v16i8 &&
9194          "SSE2 rotate lowering only needed for v16i8!");
9195
9196   // Default SSE2 implementation
9197   int LoByteShift = 16 - ByteRotation;
9198   int HiByteShift = ByteRotation;
9199
9200   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9201                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
9202   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9203                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
9204   return DAG.getBitcast(VT,
9205                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9206 }
9207
9208 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9209 ///
9210 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
9211 /// rotation of the concatenation of two vectors; This routine will
9212 /// try to generically lower a vector shuffle through such an pattern.
9213 ///
9214 /// Essentially it concatenates V1 and V2, shifts right by some number of
9215 /// elements, and takes the low elements as the result. Note that while this is
9216 /// specified as a *right shift* because x86 is little-endian, it is a *left
9217 /// rotate* of the vector lanes.
9218 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9219                                           SDValue V1, SDValue V2,
9220                                           ArrayRef<int> Mask,
9221                                           const X86Subtarget &Subtarget,
9222                                           SelectionDAG &DAG) {
9223   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9224          "Only 32-bit and 64-bit elements are supported!");
9225
9226   // 128/256-bit vectors are only supported with VLX.
9227   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9228          && "VLX required for 128/256-bit vectors");
9229
9230   SDValue Lo = V1, Hi = V2;
9231   int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9232   if (Rotation <= 0)
9233     return SDValue();
9234
9235   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9236                      DAG.getConstant(Rotation, DL, MVT::i8));
9237 }
9238
9239 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9240 ///
9241 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9242 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9243 /// matches elements from one of the input vectors shuffled to the left or
9244 /// right with zeroable elements 'shifted in'. It handles both the strictly
9245 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9246 /// quad word lane.
9247 ///
9248 /// PSHL : (little-endian) left bit shift.
9249 /// [ zz, 0, zz,  2 ]
9250 /// [ -1, 4, zz, -1 ]
9251 /// PSRL : (little-endian) right bit shift.
9252 /// [  1, zz,  3, zz]
9253 /// [ -1, -1,  7, zz]
9254 /// PSLLDQ : (little-endian) left byte shift
9255 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
9256 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
9257 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
9258 /// PSRLDQ : (little-endian) right byte shift
9259 /// [  5, 6,  7, zz, zz, zz, zz, zz]
9260 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
9261 /// [  1, 2, -1, -1, -1, -1, zz, zz]
9262 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9263                                      unsigned ScalarSizeInBits,
9264                                      ArrayRef<int> Mask, int MaskOffset,
9265                                      const APInt &Zeroable,
9266                                      const X86Subtarget &Subtarget) {
9267   int Size = Mask.size();
9268   unsigned SizeInBits = Size * ScalarSizeInBits;
9269
9270   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9271     for (int i = 0; i < Size; i += Scale)
9272       for (int j = 0; j < Shift; ++j)
9273         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9274           return false;
9275
9276     return true;
9277   };
9278
9279   auto MatchShift = [&](int Shift, int Scale, bool Left) {
9280     for (int i = 0; i != Size; i += Scale) {
9281       unsigned Pos = Left ? i + Shift : i;
9282       unsigned Low = Left ? i : i + Shift;
9283       unsigned Len = Scale - Shift;
9284       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9285         return -1;
9286     }
9287
9288     int ShiftEltBits = ScalarSizeInBits * Scale;
9289     bool ByteShift = ShiftEltBits > 64;
9290     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9291                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9292     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9293
9294     // Normalize the scale for byte shifts to still produce an i64 element
9295     // type.
9296     Scale = ByteShift ? Scale / 2 : Scale;
9297
9298     // We need to round trip through the appropriate type for the shift.
9299     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9300     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9301                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
9302     return (int)ShiftAmt;
9303   };
9304
9305   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9306   // keep doubling the size of the integer elements up to that. We can
9307   // then shift the elements of the integer vector by whole multiples of
9308   // their width within the elements of the larger integer vector. Test each
9309   // multiple to see if we can find a match with the moved element indices
9310   // and that the shifted in elements are all zeroable.
9311   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9312   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9313     for (int Shift = 1; Shift != Scale; ++Shift)
9314       for (bool Left : {true, false})
9315         if (CheckZeros(Shift, Scale, Left)) {
9316           int ShiftAmt = MatchShift(Shift, Scale, Left);
9317           if (0 < ShiftAmt)
9318             return ShiftAmt;
9319         }
9320
9321   // no match
9322   return -1;
9323 }
9324
9325 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9326                                          SDValue V2, ArrayRef<int> Mask,
9327                                          const APInt &Zeroable,
9328                                          const X86Subtarget &Subtarget,
9329                                          SelectionDAG &DAG) {
9330   int Size = Mask.size();
9331   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9332
9333   MVT ShiftVT;
9334   SDValue V = V1;
9335   unsigned Opcode;
9336
9337   // Try to match shuffle against V1 shift.
9338   int ShiftAmt = matchVectorShuffleAsShift(
9339       ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9340
9341   // If V1 failed, try to match shuffle against V2 shift.
9342   if (ShiftAmt < 0) {
9343     ShiftAmt =
9344         matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9345                                   Mask, Size, Zeroable, Subtarget);
9346     V = V2;
9347   }
9348
9349   if (ShiftAmt < 0)
9350     return SDValue();
9351
9352   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9353          "Illegal integer vector type");
9354   V = DAG.getBitcast(ShiftVT, V);
9355   V = DAG.getNode(Opcode, DL, ShiftVT, V,
9356                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
9357   return DAG.getBitcast(VT, V);
9358 }
9359
9360 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9361 // Remainder of lower half result is zero and upper half is all undef.
9362 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9363                                       ArrayRef<int> Mask, uint64_t &BitLen,
9364                                       uint64_t &BitIdx, const APInt &Zeroable) {
9365   int Size = Mask.size();
9366   int HalfSize = Size / 2;
9367   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9368   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9369
9370   // Upper half must be undefined.
9371   if (!isUndefInRange(Mask, HalfSize, HalfSize))
9372     return false;
9373
9374   // Determine the extraction length from the part of the
9375   // lower half that isn't zeroable.
9376   int Len = HalfSize;
9377   for (; Len > 0; --Len)
9378     if (!Zeroable[Len - 1])
9379       break;
9380   assert(Len > 0 && "Zeroable shuffle mask");
9381
9382   // Attempt to match first Len sequential elements from the lower half.
9383   SDValue Src;
9384   int Idx = -1;
9385   for (int i = 0; i != Len; ++i) {
9386     int M = Mask[i];
9387     if (M == SM_SentinelUndef)
9388       continue;
9389     SDValue &V = (M < Size ? V1 : V2);
9390     M = M % Size;
9391
9392     // The extracted elements must start at a valid index and all mask
9393     // elements must be in the lower half.
9394     if (i > M || M >= HalfSize)
9395       return false;
9396
9397     if (Idx < 0 || (Src == V && Idx == (M - i))) {
9398       Src = V;
9399       Idx = M - i;
9400       continue;
9401     }
9402     return false;
9403   }
9404
9405   if (!Src || Idx < 0)
9406     return false;
9407
9408   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9409   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9410   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9411   V1 = Src;
9412   return true;
9413 }
9414
9415 // INSERTQ: Extract lowest Len elements from lower half of second source and
9416 // insert over first source, starting at Idx.
9417 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9418 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9419                                         ArrayRef<int> Mask, uint64_t &BitLen,
9420                                         uint64_t &BitIdx) {
9421   int Size = Mask.size();
9422   int HalfSize = Size / 2;
9423   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9424
9425   // Upper half must be undefined.
9426   if (!isUndefInRange(Mask, HalfSize, HalfSize))
9427     return false;
9428
9429   for (int Idx = 0; Idx != HalfSize; ++Idx) {
9430     SDValue Base;
9431
9432     // Attempt to match first source from mask before insertion point.
9433     if (isUndefInRange(Mask, 0, Idx)) {
9434       /* EMPTY */
9435     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9436       Base = V1;
9437     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9438       Base = V2;
9439     } else {
9440       continue;
9441     }
9442
9443     // Extend the extraction length looking to match both the insertion of
9444     // the second source and the remaining elements of the first.
9445     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9446       SDValue Insert;
9447       int Len = Hi - Idx;
9448
9449       // Match insertion.
9450       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9451         Insert = V1;
9452       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9453         Insert = V2;
9454       } else {
9455         continue;
9456       }
9457
9458       // Match the remaining elements of the lower half.
9459       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9460         /* EMPTY */
9461       } else if ((!Base || (Base == V1)) &&
9462                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9463         Base = V1;
9464       } else if ((!Base || (Base == V2)) &&
9465                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9466                                             Size + Hi)) {
9467         Base = V2;
9468       } else {
9469         continue;
9470       }
9471
9472       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9473       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9474       V1 = Base;
9475       V2 = Insert;
9476       return true;
9477     }
9478   }
9479
9480   return false;
9481 }
9482
9483 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9484 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9485                                            SDValue V2, ArrayRef<int> Mask,
9486                                            const APInt &Zeroable,
9487                                            SelectionDAG &DAG) {
9488   uint64_t BitLen, BitIdx;
9489   if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9490     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9491                        DAG.getConstant(BitLen, DL, MVT::i8),
9492                        DAG.getConstant(BitIdx, DL, MVT::i8));
9493
9494   if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9495     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9496                        V2 ? V2 : DAG.getUNDEF(VT),
9497                        DAG.getConstant(BitLen, DL, MVT::i8),
9498                        DAG.getConstant(BitIdx, DL, MVT::i8));
9499
9500   return SDValue();
9501 }
9502
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and can
/// start from an offset element index in the input; to avoid excess shuffling
/// the offset must either be in the bottom lane or at the start of a higher
/// lane. All extended elements must be from the same lane.
///
/// \param Scale   Extension stride: each extended element occupies Scale
///                elements of \p VT. Must be greater than 1.
/// \param Offset  Index in \p InputV of the first element to extend.
/// \param AnyExt  If true, the padding elements may be left undefined rather
///                than being zeroed.
/// \param InputV  The vector whose elements are extended.
/// \param Mask    The original shuffle mask; only consulted by the SSE4A
///                EXTRQ path.
/// \returns the lowered value bitcast back to \p VT, or an empty SDValue when
///          the SSE4.1 path declines an offset, scale-2, 128-bit extension.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    // Only the first NumElements / Scale positions are filled in; a source
    // index that falls outside the offset's lane is left undef.
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    // The extended type has Scale-times wider elements and Scale-times fewer
    // of them, so its total width equals that of VT.
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  // Everything below is the pre-SSE4.1 fallback.
  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    // Place the two source dwords in the even result positions; the odd
    // positions are don't-care.
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    // First move the dwords containing the wanted words (word index / 2) into
    // the even dword positions, then fix up the word position with a
    // PSHUFLW/PSHUFHW chosen by the parity of Offset.
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    // Extract EltBits bits starting at the bit position of element Offset.
    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    // If the upper half of the result is undef, or the next source element
    // would come from a different lane, the low extraction alone suffices.
    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    // Otherwise extract the following element as well and combine the two
    // extractions with a v2i64 UNPCKL.
    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      // Every Scale-th control byte selects the next source byte (if it is in
      // the offset's lane); all other control bytes are 0x80.
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    // Slide the input down by AlignToUnpack elements and adjust Offset to
    // match.
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks. Each iteration unpacks the input
  // against zero (or undef for an any-extend), then halves Scale and
  // NumElements and doubles EltBits for the next round.
  do {
    // Use the high unpack when the remaining offset lies in the upper half,
    // and rebase the offset into that half.
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
9657
9658 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9659 ///
9660 /// This routine will try to do everything in its power to cleverly lower
9661 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9662 /// check for the profitability of this lowering,  it tries to aggressively
9663 /// match this pattern. It will use all of the micro-architectural details it
9664 /// can to emit an efficient lowering. It handles both blends with all-zero
9665 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9666 /// masking out later).
9667 ///
9668 /// The reason we have dedicated lowering for zext-style shuffles is that they
9669 /// are both incredibly common and often quite performance sensitive.
9670 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9671     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9672     const APInt &Zeroable, const X86Subtarget &Subtarget,
9673     SelectionDAG &DAG) {
9674   int Bits = VT.getSizeInBits();
9675   int NumLanes = Bits / 128;
9676   int NumElements = VT.getVectorNumElements();
9677   int NumEltsPerLane = NumElements / NumLanes;
9678   assert(VT.getScalarSizeInBits() <= 32 &&
9679          "Exceeds 32-bit integer zero extension limit");
9680   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9681
9682   // Define a helper function to check a particular ext-scale and lower to it if
9683   // valid.
9684   auto Lower = [&](int Scale) -> SDValue {
9685     SDValue InputV;
9686     bool AnyExt = true;
9687     int Offset = 0;
9688     int Matches = 0;
9689     for (int i = 0; i < NumElements; ++i) {
9690       int M = Mask[i];
9691       if (M < 0)
9692         continue; // Valid anywhere but doesn't tell us anything.
9693       if (i % Scale != 0) {
9694         // Each of the extended elements need to be zeroable.
9695         if (!Zeroable[i])
9696           return SDValue();
9697
9698         // We no longer are in the anyext case.
9699         AnyExt = false;
9700         continue;
9701       }
9702
9703       // Each of the base elements needs to be consecutive indices into the
9704       // same input vector.
9705       SDValue V = M < NumElements ? V1 : V2;
9706       M = M % NumElements;
9707       if (!InputV) {
9708         InputV = V;
9709         Offset = M - (i / Scale);
9710       } else if (InputV != V)
9711         return SDValue(); // Flip-flopping inputs.
9712
9713       // Offset must start in the lowest 128-bit lane or at the start of an
9714       // upper lane.
9715       // FIXME: Is it ever worth allowing a negative base offset?
9716       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9717             (Offset % NumEltsPerLane) == 0))
9718         return SDValue();
9719
9720       // If we are offsetting, all referenced entries must come from the same
9721       // lane.
9722       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9723         return SDValue();
9724
9725       if ((M % NumElements) != (Offset + (i / Scale)))
9726         return SDValue(); // Non-consecutive strided elements.
9727       Matches++;
9728     }
9729
9730     // If we fail to find an input, we have a zero-shuffle which should always
9731     // have already been handled.
9732     // FIXME: Maybe handle this here in case during blending we end up with one?
9733     if (!InputV)
9734       return SDValue();
9735
9736     // If we are offsetting, don't extend if we only match a single input, we
9737     // can always do better by using a basic PSHUF or PUNPCK.
9738     if (Offset != 0 && Matches < 2)
9739       return SDValue();
9740
9741     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9742         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9743   };
9744
9745   // The widest scale possible for extending is to a 64-bit integer.
9746   assert(Bits % 64 == 0 &&
9747          "The number of bits in a vector must be divisible by 64 on x86!");
9748   int NumExtElements = Bits / 64;
9749
9750   // Each iteration, try extending the elements half as much, but into twice as
9751   // many elements.
9752   for (; NumExtElements < NumElements; NumExtElements *= 2) {
9753     assert(NumElements % NumExtElements == 0 &&
9754            "The input vector size must be divisible by the extended size.");
9755     if (SDValue V = Lower(NumElements / NumExtElements))
9756       return V;
9757   }
9758
9759   // General extends failed, but 128-bit vectors may be able to use MOVQ.
9760   if (Bits != 128)
9761     return SDValue();
9762
9763   // Returns one of the source operands if the shuffle can be reduced to a
9764   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9765   auto CanZExtLowHalf = [&]() {
9766     for (int i = NumElements / 2; i != NumElements; ++i)
9767       if (!Zeroable[i])
9768         return SDValue();
9769     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9770       return V1;
9771     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9772       return V2;
9773     return SDValue();
9774   };
9775
9776   if (SDValue V = CanZExtLowHalf()) {
9777     V = DAG.getBitcast(MVT::v2i64, V);
9778     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9779     return DAG.getBitcast(VT, V);
9780   }
9781
9782   // No viable ext lowering found.
9783   return SDValue();
9784 }
9785
9786 /// \brief Try to get a scalar value for a specific element of a vector.
9787 ///
9788 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9789 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9790                                               SelectionDAG &DAG) {
9791   MVT VT = V.getSimpleValueType();
9792   MVT EltVT = VT.getVectorElementType();
9793   V = peekThroughBitcasts(V);
9794
9795   // If the bitcasts shift the element size, we can't extract an equivalent
9796   // element from it.
9797   MVT NewVT = V.getSimpleValueType();
9798   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9799     return SDValue();
9800
9801   if (V.getOpcode() == ISD::BUILD_VECTOR ||
9802       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9803     // Ensure the scalar operand is the same size as the destination.
9804     // FIXME: Add support for scalar truncation where possible.
9805     SDValue S = V.getOperand(Idx);
9806     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9807       return DAG.getBitcast(EltVT, S);
9808   }
9809
9810   return SDValue();
9811 }
9812
9813 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9814 ///
9815 /// This is particularly important because the set of instructions varies
9816 /// significantly based on whether the operand is a load or not.
9817 static bool isShuffleFoldableLoad(SDValue V) {
9818   V = peekThroughBitcasts(V);
9819   return ISD::isNON_EXTLoad(V.getNode());
9820 }
9821
9822 /// \brief Try to lower insertion of a single element into a zero vector.
9823 ///
9824 /// This is a common pattern that we have especially efficient patterns to lower
9825 /// across all subtarget feature sets.
9826 static SDValue lowerVectorShuffleAsElementInsertion(
9827     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9828     const APInt &Zeroable, const X86Subtarget &Subtarget,
9829     SelectionDAG &DAG) {
9830   MVT ExtVT = VT;
9831   MVT EltVT = VT.getVectorElementType();
9832
9833   int V2Index =
9834       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9835       Mask.begin();
9836   bool IsV1Zeroable = true;
9837   for (int i = 0, Size = Mask.size(); i < Size; ++i)
9838     if (i != V2Index && !Zeroable[i]) {
9839       IsV1Zeroable = false;
9840       break;
9841     }
9842
9843   // Check for a single input from a SCALAR_TO_VECTOR node.
9844   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9845   // all the smarts here sunk into that routine. However, the current
9846   // lowering of BUILD_VECTOR makes that nearly impossible until the old
9847   // vector shuffle lowering is dead.
9848   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9849                                                DAG);
9850   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9851     // We need to zext the scalar if it is smaller than an i32.
9852     V2S = DAG.getBitcast(EltVT, V2S);
9853     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9854       // Using zext to expand a narrow element won't work for non-zero
9855       // insertions.
9856       if (!IsV1Zeroable)
9857         return SDValue();
9858
9859       // Zero-extend directly to i32.
9860       ExtVT = MVT::v4i32;
9861       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9862     }
9863     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9864   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9865              EltVT == MVT::i16) {
9866     // Either not inserting from the low element of the input or the input
9867     // element size is too small to use VZEXT_MOVL to clear the high bits.
9868     return SDValue();
9869   }
9870
9871   if (!IsV1Zeroable) {
9872     // If V1 can't be treated as a zero vector we have fewer options to lower
9873     // this. We can't support integer vectors or non-zero targets cheaply, and
9874     // the V1 elements can't be permuted in any way.
9875     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9876     if (!VT.isFloatingPoint() || V2Index != 0)
9877       return SDValue();
9878     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9879     V1Mask[V2Index] = -1;
9880     if (!isNoopShuffleMask(V1Mask))
9881       return SDValue();
9882     // This is essentially a special case blend operation, but if we have
9883     // general purpose blend operations, they are always faster. Bail and let
9884     // the rest of the lowering handle these as blends.
9885     if (Subtarget.hasSSE41())
9886       return SDValue();
9887
9888     // Otherwise, use MOVSD or MOVSS.
9889     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9890            "Only two types of floating point element types to handle!");
9891     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9892                        ExtVT, V1, V2);
9893   }
9894
9895   // This lowering only works for the low element with floating point vectors.
9896   if (VT.isFloatingPoint() && V2Index != 0)
9897     return SDValue();
9898
9899   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9900   if (ExtVT != VT)
9901     V2 = DAG.getBitcast(VT, V2);
9902
9903   if (V2Index != 0) {
9904     // If we have 4 or fewer lanes we can cheaply shuffle the element into
9905     // the desired position. Otherwise it is more efficient to do a vector
9906     // shift left. We know that we can do a vector shift left because all
9907     // the inputs are zero.
9908     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9909       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9910       V2Shuffle[V2Index] = 0;
9911       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9912     } else {
9913       V2 = DAG.getBitcast(MVT::v16i8, V2);
9914       V2 = DAG.getNode(
9915           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9916           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9917                           DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9918                               DAG.getDataLayout(), VT)));
9919       V2 = DAG.getBitcast(VT, V2);
9920     }
9921   }
9922   return V2;
9923 }
9924
9925 /// Try to lower broadcast of a single - truncated - integer element,
9926 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9927 ///
9928 /// This assumes we have AVX2.
     ///
     /// \p BroadcastIdx is the index of the element to broadcast, counted in
     /// elements of \p VT (the narrow type), not in elements of \p V0.
     ///
     /// \returns an X86ISD::VBROADCAST of the truncated scalar, or an empty SDValue
     /// when \p V0 does not have wider integer elements than \p VT, or is neither
     /// a BUILD_VECTOR nor a SCALAR_TO_VECTOR whose element 0 holds the requested
     /// narrow element.
9929 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9930                                                   SDValue V0, int BroadcastIdx,
9931                                                   const X86Subtarget &Subtarget,
9932                                                   SelectionDAG &DAG) {
9933   assert(Subtarget.hasAVX2() &&
9934          "We can only lower integer broadcasts with AVX2!");
9935
9936   EVT EltVT = VT.getVectorElementType();
9937   EVT V0VT = V0.getValueType();
9938
9939   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9940   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9941
9942   EVT V0EltVT = V0VT.getVectorElementType();
9943   if (!V0EltVT.isInteger())
9944     return SDValue();
9945
9946   const unsigned EltSize = EltVT.getSizeInBits();
9947   const unsigned V0EltSize = V0EltVT.getSizeInBits();
9948
9949   // This is only a truncation if the original element type is larger.
9950   if (V0EltSize <= EltSize)
9951     return SDValue();
9952
9953   assert(((V0EltSize % EltSize) == 0) &&
9954          "Scalar type sizes must all be powers of 2 on x86!");
9955
       // Scale is the number of VT-sized elements that fit in one V0 element;
       // V0BroadcastIdx is the V0 element that contains the requested VT element.
9956   const unsigned V0Opc = V0.getOpcode();
9957   const unsigned Scale = V0EltSize / EltSize;
9958   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9959
       // Only BUILD_VECTOR (any element) and SCALAR_TO_VECTOR (element 0 only)
       // are handled.
9960   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9961       V0Opc != ISD::BUILD_VECTOR)
9962     return SDValue();
9963
       // Fetch the wide scalar operand that contains the requested element.
9964   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9965
9966   // If we're extracting non-least-significant bits, shift so we can truncate.
9967   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9968   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9969   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
       // OffsetIdx is the position of the requested element inside the wide
       // scalar, so the shift amount is OffsetIdx * EltSize bits.
9970   if (const int OffsetIdx = BroadcastIdx % Scale)
9971     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9972             DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9973
       // Truncate to the narrow element type and broadcast the result.
9974   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9975                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9976 }
9977
9978 /// \brief Try to lower broadcast of a single element.
9979 ///
9980 /// For convenience, this code also bundles all of the subtarget feature set
9981 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9982 /// a convenient way to factor it out.
9983 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9984                                              SDValue V1, SDValue V2,
9985                                              ArrayRef<int> Mask,
9986                                              const X86Subtarget &Subtarget,
9987                                              SelectionDAG &DAG) {
9988   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9989         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9990         (Subtarget.hasAVX2() && VT.isInteger())))
9991     return SDValue();
9992
9993   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9994   // we can only broadcast from a register with AVX2.
9995   unsigned NumElts = Mask.size();
9996   unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9997   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9998
9999   // Check that the mask is a broadcast.
10000   int BroadcastIdx = -1;
10001   for (int i = 0; i != (int)NumElts; ++i) {
10002     SmallVector<int, 8> BroadcastMask(NumElts, i);
10003     if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10004       BroadcastIdx = i;
10005       break;
10006     }
10007   }
10008
10009   if (BroadcastIdx < 0)
10010     return SDValue();
10011   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10012                                             "a sorted mask where the broadcast "
10013                                             "comes from V1.");
10014
10015   // Go up the chain of (vector) values to find a scalar load that we can
10016   // combine with the broadcast.
10017   SDValue V = V1;
10018   for (;;) {
10019     switch (V.getOpcode()) {
10020     case ISD::BITCAST: {
10021       SDValue VSrc = V.getOperand(0);
10022       MVT SrcVT = VSrc.getSimpleValueType();
10023       if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
10024         break;
10025       V = VSrc;
10026       continue;
10027     }
10028     case ISD::CONCAT_VECTORS: {
10029       int OperandSize = Mask.size() / V.getNumOperands();
10030       V = V.getOperand(BroadcastIdx / OperandSize);
10031       BroadcastIdx %= OperandSize;
10032       continue;
10033     }
10034     case ISD::INSERT_SUBVECTOR: {
10035       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10036       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10037       if (!ConstantIdx)
10038         break;
10039
10040       int BeginIdx = (int)ConstantIdx->getZExtValue();
10041       int EndIdx =
10042           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10043       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10044         BroadcastIdx -= BeginIdx;
10045         V = VInner;
10046       } else {
10047         V = VOuter;
10048       }
10049       continue;
10050     }
10051     }
10052     break;
10053   }
10054
10055   // Check if this is a broadcast of a scalar. We special case lowering
10056   // for scalars so that we can more effectively fold with loads.
10057   // First, look through bitcast: if the original value has a larger element
10058   // type than the shuffle, the broadcast element is in essence truncated.
10059   // Make that explicit to ease folding.
10060   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10061     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10062             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10063       return TruncBroadcast;
10064
10065   MVT BroadcastVT = VT;
10066
10067   // Peek through any bitcast (only useful for loads).
10068   SDValue BC = peekThroughBitcasts(V);
10069
10070   // Also check the simpler case, where we can directly reuse the scalar.
10071   if (V.getOpcode() == ISD::BUILD_VECTOR ||
10072       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10073     V = V.getOperand(BroadcastIdx);
10074
10075     // If we can't broadcast from a register, check that the input is a load.
10076     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10077       return SDValue();
10078   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10079     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10080     if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10081       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10082       Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
10083     }
10084
10085     // If we are broadcasting a load that is only used by the shuffle
10086     // then we can reduce the vector load to the broadcasted scalar load.
10087     LoadSDNode *Ld = cast<LoadSDNode>(BC);
10088     SDValue BaseAddr = Ld->getOperand(1);
10089     EVT SVT = BroadcastVT.getScalarType();
10090     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10091     SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10092     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10093                     DAG.getMachineFunction().getMachineMemOperand(
10094                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10095     DAG.makeEquivalentMemoryOrdering(Ld, V);
10096   } else if (!BroadcastFromReg) {
10097     // We can't broadcast from a vector register.
10098     return SDValue();
10099   } else if (BroadcastIdx != 0) {
10100     // We can only broadcast from the zero-element of a vector register,
10101     // but it can be advantageous to broadcast from the zero-element of a
10102     // subvector.
10103     if (!VT.is256BitVector() && !VT.is512BitVector())
10104       return SDValue();
10105
10106     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10107     if (VT == MVT::v4f64 || VT == MVT::v4i64)
10108       return SDValue();
10109
10110     // Only broadcast the zero-element of a 128-bit subvector.
10111     unsigned EltSize = VT.getScalarSizeInBits();
10112     if (((BroadcastIdx * EltSize) % 128) != 0)
10113       return SDValue();
10114
10115     // The shuffle input might have been a bitcast we looked through; look at
10116     // the original input vector.  Emit an EXTRACT_SUBVECTOR of that type; we'll
10117     // later bitcast it to BroadcastVT.
10118     MVT SrcVT = V.getSimpleValueType();
10119     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10120            "Unexpected vector element size");
10121     assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10122            "Unexpected vector size");
10123
10124     MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
10125     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10126                     DAG.getIntPtrConstant(BroadcastIdx, DL));
10127   }
10128
10129   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10130     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10131                     DAG.getBitcast(MVT::f64, V));
10132
10133   // Bitcast back to the same scalar type as BroadcastVT.
10134   MVT SrcVT = V.getSimpleValueType();
10135   if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10136     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10137            "Unexpected vector element size");
10138     if (SrcVT.isVector()) {
10139       unsigned NumSrcElts = SrcVT.getVectorNumElements();
10140       SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10141     } else {
10142       SrcVT = BroadcastVT.getScalarType();
10143     }
10144     V = DAG.getBitcast(SrcVT, V);
10145   }
10146
10147   // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10148   if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10149     V = DAG.getBitcast(MVT::f64, V);
10150     unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10151     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10152   }
10153
10154   // We only support broadcasting from 128-bit vectors to minimize the
10155   // number of patterns we need to deal with in isel. So extract down to
10156   // 128-bits.
10157   if (SrcVT.getSizeInBits() > 128)
10158     V = extract128BitVector(V, 0, DAG, DL);
10159
10160   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10161 }
10162
10163 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10164 // INSERTPS when the V1 elements are already in the correct locations
10165 // because otherwise we can just always use two SHUFPS instructions which
10166 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10167 // perform INSERTPS if a single V1 element is out of place and all V2
10168 // elements are zeroable.
      //
      // Mask is a 4-element mask in which indices 0-3 name V1 and 4-7 name V2;
      // Zeroable has one bit per result lane. On success this returns true and
      // overwrites V1, V2 and InsertPSMask with the operands and immediate to
      // use. On failure it returns false and leaves all three unchanged.
10169 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10170                                          unsigned &InsertPSMask,
10171                                          const APInt &Zeroable,
10172                                          ArrayRef<int> Mask,
10173                                          SelectionDAG &DAG) {
10174   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10175   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10176   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10177
10178   // Attempt to match INSERTPS with one element from VA or VB being
10179   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10180   // are updated.
        // VA and VB are taken by value, so the caller's V1/V2 are only written by
        // the explicit assignments just before the lambda returns true.
10181   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10182                              ArrayRef<int> CandidateMask) {
10183     unsigned ZMask = 0;
10184     int VADstIndex = -1;
10185     int VBDstIndex = -1;
10186     bool VAUsedInPlace = false;
10187
10188     for (int i = 0; i < 4; ++i) {
10189       // Synthesize a zero mask from the zeroable elements (includes undefs).
10190       if (Zeroable[i]) {
10191         ZMask |= 1 << i;
10192         continue;
10193       }
10194
10195       // Flag if we use any VA inputs in place.
10196       if (i == CandidateMask[i]) {
10197         VAUsedInPlace = true;
10198         continue;
10199       }
10200
10201       // We can only insert a single non-zeroable element.
10202       if (VADstIndex >= 0 || VBDstIndex >= 0)
10203         return false;
10204
10205       if (CandidateMask[i] < 4) {
10206         // VA input out of place for insertion.
10207         VADstIndex = i;
10208       } else {
10209         // VB input for insertion.
10210         VBDstIndex = i;
10211       }
10212     }
10213
10214     // Don't bother if we have no (non-zeroable) element for insertion.
10215     if (VADstIndex < 0 && VBDstIndex < 0)
10216       return false;
10217
10218     // Determine element insertion src/dst indices. The src index is from the
10219     // start of the inserted vector, not the start of the concatenated vector.
10220     unsigned VBSrcIndex = 0;
10221     if (VADstIndex >= 0) {
10222       // If we have a VA input out of place, we use VA as the V2 element
10223       // insertion and don't use the original V2 at all.
10224       VBSrcIndex = CandidateMask[VADstIndex];
10225       VBDstIndex = VADstIndex;
10226       VB = VA;
10227     } else {
            // Subtract 4 to turn the concatenated index into an index into VB.
10228       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10229     }
10230
10231     // If no V1 inputs are used in place, then the result is created only from
10232     // the zero mask and the V2 insertion - so remove V1 dependency.
10233     if (!VAUsedInPlace)
10234       VA = DAG.getUNDEF(MVT::v4f32);
10235
10236     // Update V1, V2 and InsertPSMask accordingly.
10237     V1 = VA;
10238     V2 = VB;
10239
10240     // Insert the V2 element into the desired position.
          // Immediate layout: bits [7:6] source element, bits [5:4] destination
          // lane, bits [3:0] lanes to zero.
10241     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10242     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10243     return true;
10244   };
10245
10246   if (matchAsInsertPS(V1, V2, Mask))
10247     return true;
10248
10249   // Commute and try again.
10250   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10251   ShuffleVectorSDNode::commuteMask(CommutedMask);
10252   if (matchAsInsertPS(V2, V1, CommutedMask))
10253     return true;
10254
10255   return false;
10256 }
10257
      /// Try to lower a v4f32 shuffle as a single X86ISD::INSERTPS node.
      ///
      /// \p V1 and \p V2 are passed by value, so the operand rewriting done by
      /// matchVectorShuffleAsInsertPS only affects the local copies used to build
      /// the node.
      ///
      /// \returns the INSERTPS node, or an empty SDValue if
      /// matchVectorShuffleAsInsertPS cannot match \p Mask.
10258 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10259                                             SDValue V2, ArrayRef<int> Mask,
10260                                             const APInt &Zeroable,
10261                                             SelectionDAG &DAG) {
10262   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10263   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10264
10265   // Attempt to match the insertps pattern.
10266   unsigned InsertPSMask;
10267   if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10268     return SDValue();
10269
10270   // Insert the V2 element into the desired position.
10271   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10272                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
10273 }
10274
10275 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10276 /// UNPCK instruction.
10277 ///
10278 /// This specifically targets cases where we end up with alternating between
10279 /// the two inputs, and so can permute them into something that feeds a single
10280 /// UNPCK instruction. Note that this routine only targets integer vectors
10281 /// because for floating point vectors we have a generalized SHUFPS lowering
10282 /// strategy that handles everything that doesn't *exactly* match an unpack,
10283 /// making this clever lowering unnecessary.
      ///
      /// Two strategies are tried in order: permute each input and then unpack
      /// (at element widths from 64 bits down to the element width of \p VT), or,
      /// when every defined mask element comes from the same half of its input,
      /// unpack first and permute the result.
      ///
      /// \returns the lowered value, or an empty SDValue if neither strategy
      /// applies.
10284 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10285                                                     SDValue V1, SDValue V2,
10286                                                     ArrayRef<int> Mask,
10287                                                     SelectionDAG &DAG) {
10288   assert(!VT.isFloatingPoint() &&
10289          "This routine only supports integer vectors.");
10290   assert(VT.is128BitVector() &&
10291          "This routine only works on 128-bit vectors.");
10292   assert(!V2.isUndef() &&
10293          "This routine should only be used when blending two inputs.");
10294   assert(Mask.size() >= 2 && "Single element masks are invalid.");
10295
10296   int Size = Mask.size();
10297
        // Count the defined mask elements that read the low half and the high
        // half of their input. M % Size is the index within whichever input M
        // names; for an undef (negative) M the remainder is negative, so the
        // second predicate does not count it either.
10298   int NumLoInputs =
10299       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10300   int NumHiInputs =
10301       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10302
10303   bool UnpackLo = NumLoInputs >= NumHiInputs;
10304
        // Try to build "permute V1, permute V2, unpack" with unpack elements of
        // ScalarSize bits, each spanning Scale elements of VT. V1 and V2 are
        // captured by reference and are only overwritten once the attempt is
        // known to succeed; both early exits happen before any assignment.
10305   auto TryUnpack = [&](int ScalarSize, int Scale) {
10306     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10307     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10308
10309     for (int i = 0; i < Size; ++i) {
10310       if (Mask[i] < 0)
10311         continue;
10312
10313       // Each element of the unpack contains Scale elements from this mask.
10314       int UnpackIdx = i / Scale;
10315
10316       // We only handle the case where V1 feeds the first slots of the unpack.
10317       // We rely on canonicalization to ensure this is the case.
10318       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10319         return SDValue();
10320
10321       // Setup the mask for this input. The indexing is tricky as we have to
10322       // handle the unpack stride.
10323       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10324       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10325           Mask[i] % Size;
10326     }
10327
10328     // If we will have to shuffle both inputs to use the unpack, check whether
10329     // we can just unpack first and shuffle the result. If so, skip this unpack.
10330     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10331         !isNoopShuffleMask(V2Mask))
10332       return SDValue();
10333
10334     // Shuffle the inputs into place.
10335     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10336     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10337
10338     // Cast the inputs to the type we will use to unpack them.
10339     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10340     V1 = DAG.getBitcast(UnpackVT, V1);
10341     V2 = DAG.getBitcast(UnpackVT, V2);
10342
10343     // Unpack the inputs and cast the result back to the desired type.
10344     return DAG.getBitcast(
10345         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10346                         UnpackVT, V1, V2));
10347   };
10348
10349   // We try each unpack from the largest to the smallest to try and find one
10350   // that fits this mask.
10351   int OrigScalarSize = VT.getScalarSizeInBits();
10352   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10353     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10354       return Unpack;
10355
10356   // If none of the unpack-rooted lowerings worked (or were profitable) try an
10357   // initial unpack.
10358   if (NumLoInputs == 0 || NumHiInputs == 0) {
10359     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10360            "We have to have *some* inputs!");
          // HalfOffset is the first index of the half that all inputs come from.
10361     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10362
10363     // FIXME: We could consider the total complexity of the permute of each
10364     // possible unpacking. Or at the least we should consider how many
10365     // half-crossings are created.
10366     // FIXME: We could consider commuting the unpacks.
10367
          // In the unpacked vector, element k of the chosen half of V1 sits at
          // 2 * k and the matching element of V2 at 2 * k + 1.
10368     SmallVector<int, 32> PermMask((unsigned)Size, -1);
10369     for (int i = 0; i < Size; ++i) {
10370       if (Mask[i] < 0)
10371         continue;
10372
10373       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10374
10375       PermMask[i] =
10376           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10377     }
10378     return DAG.getVectorShuffle(
10379         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10380                             DL, VT, V1, V2),
10381         DAG.getUNDEF(VT), PermMask);
10382   }
10383
10384   return SDValue();
10385 }
10386
10387 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10388 ///
10389 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10390 /// support for floating point shuffles but not integer shuffles. These
10391 /// instructions will incur a domain crossing penalty on some chips though so
10392 /// it is better to avoid lowering through this for integer vectors where
10393 /// possible.
      ///
      /// \p Mask has two entries; indices 0-1 name \p V1 and 2-3 name \p V2. Every
      /// path through this function returns a node, ending with a SHUFP fallback.
10394 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10395                                        const APInt &Zeroable,
10396                                        SDValue V1, SDValue V2,
10397                                        const X86Subtarget &Subtarget,
10398                                        SelectionDAG &DAG) {
10399   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10400   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10401   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10402
10403   if (V2.isUndef()) {
10404     // Check for being able to broadcast a single element.
10405     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10406             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10407       return Broadcast;
10408
10409     // Straight shuffle of a single input vector. Simulate this by using the
10410     // single input as both of the "inputs" to this instruction.
          // Bit i of the immediate is set when result lane i takes element 1.
10411     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10412
10413     if (Subtarget.hasAVX()) {
10414       // If we have AVX, we can use VPERMILPD (VPERMILPI on v2f64) which will
            // allow folding a load
10415       // into the shuffle.
10416       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10417                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10418     }
10419
          // An operand whose lane is undef in the mask is passed as UNDEF.
10420     return DAG.getNode(
10421         X86ISD::SHUFP, DL, MVT::v2f64,
10422         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10423         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10424         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10425   }
10426   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10427   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10428
10429   // If we have a single input, insert that into V1 if we can do so cheaply.
10430   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10431     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10432             DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10433       return Insertion;
10434     // Try inverting the insertion since for v2 masks it is easy to do and we
10435     // can't reliably sort the mask one way or the other.
          // XOR with 2 flips which input an index refers to (0,1 <-> 2,3), to
          // match the swapped V2/V1 operand order below.
10436     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10437                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10438     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10439             DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10440       return Insertion;
10441   }
10442
10443   // Try to use one of the special instruction patterns to handle two common
10444   // blend patterns if a zero-blend above didn't work.
10445   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10446       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10447     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10448       // We can either use a special instruction to load over the low double or
10449       // to move just the low double.
10450       return DAG.getNode(
10451           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10452           DL, MVT::v2f64, V2,
10453           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10454
10455   if (Subtarget.hasSSE41())
10456     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10457                                                   Zeroable, Subtarget, DAG))
10458       return Blend;
10459
10460   // Use dedicated unpack instructions for masks that match their pattern.
10461   if (SDValue V =
10462           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10463     return V;
10464
        // Fallback: a two-input SHUFP. Bit 0 picks the V1 element for lane 0 and
        // bit 1 picks the V2 element for lane 1 (Mask[1] - 2 is the index within
        // V2).
10465   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10466   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10467                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10468 }
10469
10470 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10471 ///
10472 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10473 /// the integer unit to minimize domain crossing penalties. However, for blends
10474 /// it falls back to the floating point shuffle operation with appropriate bit
10475 /// casting.
      ///
      /// \p Mask has two entries; indices 0-1 name \p V1 and 2-3 name \p V2. The
      /// specialized lowerings are tried in the order written below; the final
      /// fallback is a v2f64 shuffle of the bitcast inputs.
10476 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10477                                        const APInt &Zeroable,
10478                                        SDValue V1, SDValue V2,
10479                                        const X86Subtarget &Subtarget,
10480                                        SelectionDAG &DAG) {
10481   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10482   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10483   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10484
10485   if (V2.isUndef()) {
10486     // Check for being able to broadcast a single element.
10487     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10488             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10489       return Broadcast;
10490
10491     // Straight shuffle of a single input vector. For everything from SSE2
10492     // onward this has a single fast instruction with no scary immediates.
10493     // We have to map the mask as it is actually a v4i32 shuffle instruction.
          // Each 64-bit index m becomes the 32-bit pair {2m, 2m+1}; std::max maps
          // an undef (-1) lane to element 0.
10494     V1 = DAG.getBitcast(MVT::v4i32, V1);
10495     int WidenedMask[4] = {
10496         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10497         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10498     return DAG.getBitcast(
10499         MVT::v2i64,
10500         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10501                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10502   }
10503   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10504   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10505   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10506   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10507
10508   // If we have a blend of two same-type PACKUS operations and the blend aligns
10509   // with the low and high halves, we can just merge the PACKUS operations.
10510   // This is particularly important as it lets us merge shuffles that this
10511   // routine itself creates.
        // GetPackNode returns the PACKUS node behind any bitcasts, or an empty
        // SDValue if the value is not a PACKUS.
10512   auto GetPackNode = [](SDValue V) {
10513     V = peekThroughBitcasts(V);
10514     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10515   };
10516   if (SDValue V1Pack = GetPackNode(V1))
10517     if (SDValue V2Pack = GetPackNode(V2)) {
10518       EVT PackVT = V1Pack.getValueType();
10519       if (PackVT == V2Pack.getValueType())
              // Result lane 0 reuses the PACKUS operand that produced the chosen
              // half of V1, and lane 1 the one that produced the chosen half of
              // V2.
10520         return DAG.getBitcast(MVT::v2i64,
10521                               DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10522                                           Mask[0] == 0 ? V1Pack.getOperand(0)
10523                                                        : V1Pack.getOperand(1),
10524                                           Mask[1] == 2 ? V2Pack.getOperand(0)
10525                                                        : V2Pack.getOperand(1)));
10526     }
10527
10528   // Try to use shift instructions.
10529   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10530                                                 Zeroable, Subtarget, DAG))
10531     return Shift;
10532
10533   // When loading a scalar and then shuffling it into a vector we can often do
10534   // the insertion cheaply.
10535   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10536           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10537     return Insertion;
10538   // Try inverting the insertion since for v2 masks it is easy to do and we
10539   // can't reliably sort the mask one way or the other.
        // XOR with 2 flips which input an index refers to (0,1 <-> 2,3); no undef
        // check is needed because of the asserts above.
10540   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10541   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10542           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10543     return Insertion;
10544
10545   // We have different paths for blend lowering, but they all must use the
10546   // *exact* same predicate.
10547   bool IsBlendSupported = Subtarget.hasSSE41();
10548   if (IsBlendSupported)
10549     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10550                                                   Zeroable, Subtarget, DAG))
10551       return Blend;
10552
10553   // Use dedicated unpack instructions for masks that match their pattern.
10554   if (SDValue V =
10555           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10556     return V;
10557
10558   // Try to use byte rotation instructions.
10559   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10560   if (Subtarget.hasSSSE3())
10561     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10562             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10563       return Rotate;
10564
10565   // If we have direct support for blends, we should lower by decomposing into
10566   // a permute. That will be faster than the domain cross.
10567   if (IsBlendSupported)
10568     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10569                                                       Mask, DAG);
10570
10571   // We implement this with SHUFPD which is pretty lame because it will likely
10572   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10573   // However, all the alternatives are still more cycles and newer chips don't
10574   // have this problem. It would be really nice if x86 had better shuffles here.
10575   V1 = DAG.getBitcast(MVT::v2f64, V1);
10576   V2 = DAG.getBitcast(MVT::v2f64, V2);
10577   return DAG.getBitcast(MVT::v2i64,
10578                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10579 }
10580
10581 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10582 ///
10583 /// This is used to disable more specialized lowerings when the shufps lowering
10584 /// will happen to be efficient.
      ///
      /// \p Mask must have exactly four entries, each in [-1, 7]: -1 is undef,
      /// 0-3 name the first input and 4-7 the second.
      ///
      /// \returns false only when both entries of the low half (or both entries of
      /// the high half) are defined and come from different inputs; true
      /// otherwise.
10585 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10586   // This routine only handles 128-bit shufps.
10587   assert(Mask.size() == 4 && "Unsupported mask size!");
10588   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10589   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10590   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10591   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10592
10593   // To lower with a single SHUFPS we need to have the low half and high half
10594   // each requiring a single input.
        // An undef entry never conflicts, so a half is only rejected when both of
        // its entries are defined and disagree about the input (index < 4 or not).
10595   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10596     return false;
10597   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10598     return false;
10599
10600   return true;
10601 }
10602
10603 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10604 ///
10605 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10606 /// It makes no assumptions about whether this is the *best* lowering, it simply
10607 /// uses it.
      ///
      /// \param DL Location passed to every DAG.getNode call made here.
      /// \param VT Result type of every X86ISD::SHUFP node created here.
      /// \param Mask Shuffle mask; elements 0-3 are indexed directly, and entries
      /// that are >= 4 are treated as selecting from V2.
      /// \param V1 First shuffle input.
      /// \param V2 Second shuffle input.
      /// \param DAG Selection DAG used to create the nodes.
      /// \returns The final X86ISD::SHUFP node. Two of the paths below first create
      /// an additional SHUFP node that pre-blends the inputs.
10608 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, VT VT,
10609                                             ArrayRef<int> Mask, SDValue V1,
10610                                             SDValue V2, SelectionDAG &DAG) {
        // LowV and HighV become the first and second operands of the final SHUFP
        // node; NewMask is the mask that node's immediate is built from.
10611   SDValue LowV = V1, HighV = V2;
10612   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10613
        // Number of mask entries that select from V2 (value >= 4).
10614   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10615
        // NOTE(review): when NumV2Elements is 0, 3 or 4 neither branch below runs
        // and the mask reaches the final SHUFP unmodified. Presumably callers
        // never get here with those shapes -- confirm against the call sites.
10616   if (NumV2Elements == 1) {
10617     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10618
10619     // Compute the index adjacent to V2Index and in the same half by toggling
10620     // the low bit.
10621     int V2AdjIndex = V2Index ^ 1;
10622
10623     if (Mask[V2AdjIndex] < 0) {
10624       // Handles all the cases where we have a single V2 element and an undef.
10625       // This will only ever happen in the high lanes because we commute the
10626       // vector otherwise.
10627       if (V2Index < 2)
10628         std::swap(LowV, HighV);
            // Rebase the V2 entry from the 4..7 range to 0..3.
10629       NewMask[V2Index] -= 4;
10630     } else {
10631       // Handle the case where the V2 element ends up adjacent to a V1 element.
10632       // To make this work, blend them together as the first step.
10633       int V1Index = V2AdjIndex;
10634       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
            // Note the operand order: V2 is the first operand of this SHUFP and V1
            // the second. From here on V2 names the blended vector.
10635       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10636                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10637
10638       // Now proceed to reconstruct the final blend as we have the necessary
10639       // high or low half formed.
10640       if (V2Index < 2) {
10641         LowV = V2;
10642         HighV = V1;
10643       } else {
10644         HighV = V2;
10645       }
10646       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10647       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10648     }
10649   } else if (NumV2Elements == 2) {
10650     if (Mask[0] < 4 && Mask[1] < 4) {
10651       // Handle the easy case where we have V1 in the low lanes and V2 in the
10652       // high lanes.
10653       NewMask[2] -= 4;
10654       NewMask[3] -= 4;
10655     } else if (Mask[2] < 4 && Mask[3] < 4) {
10656       // We also handle the reversed case because this utility may get called
10657       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10658       // arrange things in the right direction.
10659       NewMask[0] -= 4;
10660       NewMask[1] -= 4;
10661       HighV = V1;
10662       LowV = V2;
10663     } else {
10664       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10665       // trying to place elements directly, just blend them and set up the final
10666       // shuffle to place them.
10667
10668       // The first two blend mask elements are for V1, the second two are for
10669       // V2.
10670       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10671                           Mask[2] < 4 ? Mask[2] : Mask[3],
10672                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10673                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10674       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10675                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10676
10677       // Now we do a normal shuffle of V1 by giving V1 as both operands to
10678       // a blend.
            // Positions in the blended vector follow the BlendMask order above:
            // 0 = V1 element of the low half, 1 = V1 element of the high half,
            // 2 = V2 element of the low half, 3 = V2 element of the high half.
10679       LowV = HighV = V1;
10680       NewMask[0] = Mask[0] < 4 ? 0 : 2;
10681       NewMask[1] = Mask[0] < 4 ? 2 : 0;
10682       NewMask[2] = Mask[2] < 4 ? 1 : 3;
10683       NewMask[3] = Mask[2] < 4 ? 3 : 1;
10684     }
10685   }
10686   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10687                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10688 }
10689
10690 /// \brief Lower 4-lane 32-bit floating point shuffles.
10691 ///
10692 /// Uses instructions exclusively from the floating point unit to minimize
10693 /// domain crossing penalties, as these are sufficient to implement all v4f32
10694 /// shuffles.
      ///
      /// The strategies below are tried in order and the first one that produces a
      /// node wins, so their relative order is significant.
      ///
      /// \param DL Location passed to the nodes and helpers used here.
      /// \param Mask Four-element shuffle mask (asserted); entries >= 4 are counted
      /// as selecting from V2.
      /// \param Zeroable Forwarded unchanged to the element-insertion, blend and
      /// INSERTPS helpers.
      /// \param V1 First input; asserted to be of type v4f32.
      /// \param V2 Second input; asserted to be of type v4f32.
      /// \param Subtarget Queried for SSE3, SSE4.1 and AVX availability.
      /// \param DAG Selection DAG used to create the nodes.
      /// \returns The node produced by the first strategy that succeeds; the final
      /// fallback is lowerVectorShuffleWithSHUFPS.
10695 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10696                                        const APInt &Zeroable,
10697                                        SDValue V1, SDValue V2,
10698                                        const X86Subtarget &Subtarget,
10699                                        SelectionDAG &DAG) {
10700   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10701   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10702   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10703
        // Number of mask entries that select from V2 (value >= 4).
10704   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10705
        // Single-input case. Every path through this block returns, so the code
        // after it only runs when at least one element comes from V2.
10706   if (NumV2Elements == 0) {
10707     // Check for being able to broadcast a single element.
10708     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10709             DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10710       return Broadcast;
10711
10712     // Use even/odd duplicate instructions for masks that match their pattern.
10713     if (Subtarget.hasSSE3()) {
10714       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10715         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10716       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10717         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10718     }
10719
10720     if (Subtarget.hasAVX()) {
10721       // If we have AVX, we can use VPERMILPS which will allow folding a load
10722       // into the shuffle.
10723       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10724                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10725     }
10726
10727     // Otherwise, use a straight shuffle of a single input vector. We pass the
10728     // input vector to both operands to simulate this with a SHUFPS.
10729     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10730                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10731   }
10732
10733   // There are special ways we can lower some single-element blends. However, we
10734   // have custom ways we can lower more complex single-element blends below that
10735   // we defer to if both this and BLENDPS fail to match, so restrict this to
10736   // when the V2 input is targeting element 0 of the mask -- that is the fast
10737   // case here.
10738   if (NumV2Elements == 1 && Mask[0] >= 4)
10739     if (SDValue V = lowerVectorShuffleAsElementInsertion(
10740             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10741       return V;
10742
        // The blend, INSERTPS and blend-and-permute strategies are only attempted
        // when the subtarget reports SSE4.1.
10743   if (Subtarget.hasSSE41()) {
10744     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10745                                                   Zeroable, Subtarget, DAG))
10746       return Blend;
10747
10748     // Use INSERTPS if we can complete the shuffle efficiently.
10749     if (SDValue V =
10750             lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10751       return V;
10752
          // Skip blend-and-permute when the mask already fits one SHUFPS; such
          // masks fall through to the strategies below.
10753     if (!isSingleSHUFPSMask(Mask))
10754       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10755               DL, MVT::v4f32, V1, V2, Mask, DAG))
10756         return BlendPerm;
10757   }
10758
10759   // Use low/high mov instructions.
        // Note the operand order: MOVLHPS is built with (V1, V2) but MOVHLPS with
        // (V2, V1).
10760   if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10761     return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10762   if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10763     return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10764
10765   // Use dedicated unpack instructions for masks that match their pattern.
10766   if (SDValue V =
10767           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10768     return V;
10769
10770   // Otherwise fall back to a SHUFPS lowering strategy.
10771   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10772 }
10773
10774 /// \brief Lower 4-lane i32 vector shuffles.
10775 ///
10776 /// We try to handle these with integer-domain shuffles where we can, but for
10777 /// blends we use the floating point domain blend instructions.
      ///
      /// The strategies below are tried in order and the first one that produces a
      /// node wins, so their relative order is significant.
      ///
      /// \param DL Location passed to the nodes and helpers used here.
      /// \param Mask Four-element shuffle mask (asserted); entries >= 4 are counted
      /// as selecting from V2.
      /// \param Zeroable Forwarded unchanged to the zext, shift, element-insertion,
      /// blend and bit-mask helpers.
      /// \param V1 First input; asserted to be of type v4i32.
      /// \param V2 Second input; asserted to be of type v4i32.
      /// \param Subtarget Queried for SSSE3 and SSE4.1 availability and forwarded to
      /// several helpers.
      /// \param DAG Selection DAG used to create the nodes.
      /// \returns The node produced by the first strategy that succeeds; the final
      /// fallback is a v4f32 vector shuffle bitcast back to v4i32.
10778 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10779                                        const APInt &Zeroable,
10780                                        SDValue V1, SDValue V2,
10781                                        const X86Subtarget &Subtarget,
10782                                        SelectionDAG &DAG) {
10783   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10784   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10785   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10786
10787   // Whenever we can lower this as a zext, that instruction is strictly faster
10788   // than any alternative. It also allows us to fold memory operands into the
10789   // shuffle in many cases.
10790   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10791           DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10792     return ZExt;
10793
        // Number of mask entries that select from V2 (value >= 4).
10794   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10795
        // Single-input case. Every path through this block returns, so the code
        // after it only runs when at least one element comes from V2.
10796   if (NumV2Elements == 0) {
10797     // Check for being able to broadcast a single element.
10798     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10799             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10800       return Broadcast;
10801
10802     // Straight shuffle of a single input vector. For everything from SSE2
10803     // onward this has a single fast instruction with no scary immediates.
10804     // We coerce the shuffle pattern to be compatible with UNPCK instructions
10805     // but we aren't actually going to use the UNPCK instruction because doing
10806     // so prevents folding a load into this instruction or making a copy.
          // Mask may be rebound to one of these local arrays; that is safe because
          // it is only read by the PSHUFD construction just below, before this
          // function returns.
10807     const int UnpackLoMask[] = {0, 0, 1, 1};
10808     const int UnpackHiMask[] = {2, 2, 3, 3};
10809     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10810       Mask = UnpackLoMask;
10811     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10812       Mask = UnpackHiMask;
10813
10814     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10815                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10816   }
10817
10818   // Try to use shift instructions.
10819   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10820                                                 Zeroable, Subtarget, DAG))
10821     return Shift;
10822
10823   // There are special ways we can lower some single-element blends.
10824   if (NumV2Elements == 1)
10825     if (SDValue V = lowerVectorShuffleAsElementInsertion(
10826             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10827       return V;
10828
10829   // We have different paths for blend lowering, but they all must use the
10830   // *exact* same predicate.
        // IsBlendSupported is read again further down to choose the
        // decomposed-shuffle-blend path.
10831   bool IsBlendSupported = Subtarget.hasSSE41();
10832   if (IsBlendSupported)
10833     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10834                                                   Zeroable, Subtarget, DAG))
10835       return Blend;
10836
10837   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10838                                                    Zeroable, DAG))
10839     return Masked;
10840
10841   // Use dedicated unpack instructions for masks that match their pattern.
10842   if (SDValue V =
10843           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10844     return V;
10845
10846   // Try to use byte rotation instructions.
10847   // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10848   if (Subtarget.hasSSSE3())
10849     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10850             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10851       return Rotate;
10852
10853   // Assume that a single SHUFPS is faster than an alternative sequence of
10854   // multiple instructions (even if the CPU has a domain penalty).
10855   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10856   if (!isSingleSHUFPSMask(Mask)) {
10857     // If we have direct support for blends, we should lower by decomposing into
10858     // a permute. That will be faster than the domain cross.
          // This return is unconditional, so with blend support the
          // permute-and-unpack attempt and the SHUFPS fallback below are never
          // reached for masks that need more than one SHUFPS.
10859     if (IsBlendSupported)
10860       return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10861                                                         Mask, DAG);
10862
10863     // Try to lower by permuting the inputs into an unpack instruction.
10864     if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10865             DL, MVT::v4i32, V1, V2, Mask, DAG))
10866       return Unpack;
10867   }
10868
10869   // We implement this with SHUFPS because it can blend from two vectors.
10870   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10871   // up the inputs, bypassing domain shift penalties that we would incur if we
10872   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10873   // relevant.
        // NOTE(review): this creates a generic v4f32 vector shuffle node rather than
        // an X86ISD::SHUFP node; presumably it is lowered later through the v4f32
        // path -- confirm.
10874   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10875   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10876   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10877   return DAG.getBitcast(MVT::v4i32, ShufPS);
10878 }
10879
10880 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10881 /// shuffle lowering, and the most complex part.
10882 ///
10883 /// The lowering strategy is to try to form pairs of input lanes which are
10884 /// targeted at the same half of the final vector, and then use a dword shuffle
10885 /// to place them onto the right half, and finally unpack the paired lanes into
10886 /// their final position.
10887 ///
10888 /// The exact breakdown of how to form these dword pairs and align them on the
10889 /// correct sides is really tricky. See the comments within the function for
10890 /// more of the details.
10891 ///
10892 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10893 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10894 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10895 /// vector, form the analogous 128-bit 8-element Mask.
10896 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10897     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10898     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10899   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10900   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10901
10902   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10903   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10904   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10905
10906   SmallVector<int, 4> LoInputs;
10907   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10908   std::sort(LoInputs.begin(), LoInputs.end());
10909   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10910   SmallVector<int, 4> HiInputs;
10911   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10912   std::sort(HiInputs.begin(), HiInputs.end());
10913   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10914   int NumLToL =
10915       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10916   int NumHToL = LoInputs.size() - NumLToL;
10917   int NumLToH =
10918       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10919   int NumHToH = HiInputs.size() - NumLToH;
10920   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10921   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10922   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10923   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10924
10925   // If we are splatting two values from one half - one to each half, then
10926   // we can shuffle that half so each is splatted to a dword, then splat those
10927   // to their respective halves.
10928   auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10929                         int DOffset) {
10930     int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10931     int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10932     V = DAG.getNode(ShufWOp, DL, VT, V,
10933                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10934     V = DAG.getBitcast(PSHUFDVT, V);
10935     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10936                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10937     return DAG.getBitcast(VT, V);
10938   };
10939
10940   if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10941     return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10942   if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10943     return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10944
10945   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10946   // such inputs we can swap two of the dwords across the half mark and end up
10947   // with <=2 inputs to each half in each half. Once there, we can fall through
10948   // to the generic code below. For example:
10949   //
10950   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10951   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10952   //
10953   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10954   // and an existing 2-into-2 on the other half. In this case we may have to
10955   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10956   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10957   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10958   // because any other situation (including a 3-into-1 or 1-into-3 in the other
10959   // half than the one we target for fixing) will be fixed when we re-enter this
10960   // path. We will also combine away any sequence of PSHUFD instructions that
10961   // result into a single instruction. Here is an example of the tricky case:
10962   //
10963   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10964   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10965   //
10966   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10967   //
10968   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10969   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10970   //
10971   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10972   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10973   //
10974   // The result is fine to be handled by the generic logic.
10975   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10976                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10977                           int AOffset, int BOffset) {
10978     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10979            "Must call this with A having 3 or 1 inputs from the A half.");
10980     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10981            "Must call this with B having 1 or 3 inputs from the B half.");
10982     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10983            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10984
10985     bool ThreeAInputs = AToAInputs.size() == 3;
10986
10987     // Compute the index of dword with only one word among the three inputs in
10988     // a half by taking the sum of the half with three inputs and subtracting
10989     // the sum of the actual three inputs. The difference is the remaining
10990     // slot.
10991     int ADWord, BDWord;
10992     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10993     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10994     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10995     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10996     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10997     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10998     int TripleNonInputIdx =
10999         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11000     TripleDWord = TripleNonInputIdx / 2;
11001
11002     // We use xor with one to compute the adjacent DWord to whichever one the
11003     // OneInput is in.
11004     OneInputDWord = (OneInput / 2) ^ 1;
11005
11006     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11007     // and BToA inputs. If there is also such a problem with the BToB and AToB
11008     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11009     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11010     // is essential that we don't *create* a 3<-1 as then we might oscillate.
11011     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11012       // Compute how many inputs will be flipped by swapping these DWords. We
11013       // need
11014       // to balance this to ensure we don't form a 3-1 shuffle in the other
11015       // half.
11016       int NumFlippedAToBInputs =
11017           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11018           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11019       int NumFlippedBToBInputs =
11020           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11021           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11022       if ((NumFlippedAToBInputs == 1 &&
11023            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11024           (NumFlippedBToBInputs == 1 &&
11025            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11026         // We choose whether to fix the A half or B half based on whether that
11027         // half has zero flipped inputs. At zero, we may not be able to fix it
11028         // with that half. We also bias towards fixing the B half because that
11029         // will more commonly be the high half, and we have to bias one way.
11030         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11031                                                        ArrayRef<int> Inputs) {
11032           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11033           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11034           // Determine whether the free index is in the flipped dword or the
11035           // unflipped dword based on where the pinned index is. We use this bit
11036           // in an xor to conditionally select the adjacent dword.
11037           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11038           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11039           if (IsFixIdxInput == IsFixFreeIdxInput)
11040             FixFreeIdx += 1;
11041           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11042           assert(IsFixIdxInput != IsFixFreeIdxInput &&
11043                  "We need to be changing the number of flipped inputs!");
11044           int PSHUFHalfMask[] = {0, 1, 2, 3};
11045           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11046           V = DAG.getNode(
11047               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11048               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11049               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11050
11051           for (int &M : Mask)
11052             if (M >= 0 && M == FixIdx)
11053               M = FixFreeIdx;
11054             else if (M >= 0 && M == FixFreeIdx)
11055               M = FixIdx;
11056         };
11057         if (NumFlippedBToBInputs != 0) {
11058           int BPinnedIdx =
11059               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11060           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11061         } else {
11062           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11063           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11064           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11065         }
11066       }
11067     }
11068
11069     int PSHUFDMask[] = {0, 1, 2, 3};
11070     PSHUFDMask[ADWord] = BDWord;
11071     PSHUFDMask[BDWord] = ADWord;
11072     V = DAG.getBitcast(
11073         VT,
11074         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11075                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11076
11077     // Adjust the mask to match the new locations of A and B.
11078     for (int &M : Mask)
11079       if (M >= 0 && M/2 == ADWord)
11080         M = 2 * BDWord + M % 2;
11081       else if (M >= 0 && M/2 == BDWord)
11082         M = 2 * ADWord + M % 2;
11083
11084     // Recurse back into this routine to re-compute state now that this isn't
11085     // a 3 and 1 problem.
11086     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11087                                                      DAG);
11088   };
11089   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11090     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11091   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11092     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11093
11094   // At this point there are at most two inputs to the low and high halves from
11095   // each half. That means the inputs can always be grouped into dwords and
11096   // those dwords can then be moved to the correct half with a dword shuffle.
11097   // We use at most one low and one high word shuffle to collect these paired
11098   // inputs into dwords, and finally a dword shuffle to place them.
11099   int PSHUFLMask[4] = {-1, -1, -1, -1};
11100   int PSHUFHMask[4] = {-1, -1, -1, -1};
11101   int PSHUFDMask[4] = {-1, -1, -1, -1};
11102
11103   // First fix the masks for all the inputs that are staying in their
11104   // original halves. This will then dictate the targets of the cross-half
11105   // shuffles.
11106   auto fixInPlaceInputs =
11107       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11108                     MutableArrayRef<int> SourceHalfMask,
11109                     MutableArrayRef<int> HalfMask, int HalfOffset) {
11110     if (InPlaceInputs.empty())
11111       return;
11112     if (InPlaceInputs.size() == 1) {
11113       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11114           InPlaceInputs[0] - HalfOffset;
11115       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11116       return;
11117     }
11118     if (IncomingInputs.empty()) {
11119       // Just fix all of the in place inputs.
11120       for (int Input : InPlaceInputs) {
11121         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11122         PSHUFDMask[Input / 2] = Input / 2;
11123       }
11124       return;
11125     }
11126
11127     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11128     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11129         InPlaceInputs[0] - HalfOffset;
11130     // Put the second input next to the first so that they are packed into
11131     // a dword. We find the adjacent index by toggling the low bit.
11132     int AdjIndex = InPlaceInputs[0] ^ 1;
11133     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11134     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11135     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11136   };
11137   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11138   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11139
11140   // Now gather the cross-half inputs and place them into a free dword of
11141   // their target half.
11142   // FIXME: This operation could almost certainly be simplified dramatically to
11143   // look more like the 3-1 fixing operation.
11144   auto moveInputsToRightHalf = [&PSHUFDMask](
11145       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11146       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11147       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11148       int DestOffset) {
11149     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11150       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11151     };
11152     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11153                                                int Word) {
11154       int LowWord = Word & ~1;
11155       int HighWord = Word | 1;
11156       return isWordClobbered(SourceHalfMask, LowWord) ||
11157              isWordClobbered(SourceHalfMask, HighWord);
11158     };
11159
11160     if (IncomingInputs.empty())
11161       return;
11162
11163     if (ExistingInputs.empty()) {
11164       // Map any dwords with inputs from them into the right half.
11165       for (int Input : IncomingInputs) {
11166         // If the source half mask maps over the inputs, turn those into
11167         // swaps and use the swapped lane.
11168         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11169           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11170             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11171                 Input - SourceOffset;
11172             // We have to swap the uses in our half mask in one sweep.
11173             for (int &M : HalfMask)
11174               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11175                 M = Input;
11176               else if (M == Input)
11177                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11178           } else {
11179             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11180                        Input - SourceOffset &&
11181                    "Previous placement doesn't match!");
11182           }
11183           // Note that this correctly re-maps both when we do a swap and when
11184           // we observe the other side of the swap above. We rely on that to
11185           // avoid swapping the members of the input list directly.
11186           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11187         }
11188
11189         // Map the input's dword into the correct half.
11190         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11191           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11192         else
11193           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11194                      Input / 2 &&
11195                  "Previous placement doesn't match!");
11196       }
11197
11198       // And just directly shift any other-half mask elements to be same-half
11199       // as we will have mirrored the dword containing the element into the
11200       // same position within that half.
11201       for (int &M : HalfMask)
11202         if (M >= SourceOffset && M < SourceOffset + 4) {
11203           M = M - SourceOffset + DestOffset;
11204           assert(M >= 0 && "This should never wrap below zero!");
11205         }
11206       return;
11207     }
11208
11209     // Ensure we have the input in a viable dword of its current half. This
11210     // is particularly tricky because the original position may be clobbered
11211     // by inputs being moved and *staying* in that half.
11212     if (IncomingInputs.size() == 1) {
11213       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11214         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11215                          SourceOffset;
11216         SourceHalfMask[InputFixed - SourceOffset] =
11217             IncomingInputs[0] - SourceOffset;
11218         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11219                      InputFixed);
11220         IncomingInputs[0] = InputFixed;
11221       }
11222     } else if (IncomingInputs.size() == 2) {
11223       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11224           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11225         // We have two non-adjacent or clobbered inputs we need to extract from
11226         // the source half. To do this, we need to map them into some adjacent
11227         // dword slot in the source mask.
11228         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11229                               IncomingInputs[1] - SourceOffset};
11230
11231         // If there is a free slot in the source half mask adjacent to one of
11232         // the inputs, place the other input in it. We use (Index XOR 1) to
11233         // compute an adjacent index.
11234         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11235             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11236           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11237           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11238           InputsFixed[1] = InputsFixed[0] ^ 1;
11239         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11240                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11241           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11242           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11243           InputsFixed[0] = InputsFixed[1] ^ 1;
11244         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11245                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11246           // The two inputs are in the same DWord but it is clobbered and the
11247           // adjacent DWord isn't used at all. Move both inputs to the free
11248           // slot.
11249           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11250           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11251           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11252           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11253         } else {
11254           // The only way we hit this point is if there is no clobbering
11255           // (because there are no off-half inputs to this half) and there is no
11256           // free slot adjacent to one of the inputs. In this case, we have to
11257           // swap an input with a non-input.
11258           for (int i = 0; i < 4; ++i)
11259             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11260                    "We can't handle any clobbers here!");
11261           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11262                  "Cannot have adjacent inputs here!");
11263
11264           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11265           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11266
11267           // We also have to update the final source mask in this case because
11268           // it may need to undo the above swap.
11269           for (int &M : FinalSourceHalfMask)
11270             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11271               M = InputsFixed[1] + SourceOffset;
11272             else if (M == InputsFixed[1] + SourceOffset)
11273               M = (InputsFixed[0] ^ 1) + SourceOffset;
11274
11275           InputsFixed[1] = InputsFixed[0] ^ 1;
11276         }
11277
11278         // Point everything at the fixed inputs.
11279         for (int &M : HalfMask)
11280           if (M == IncomingInputs[0])
11281             M = InputsFixed[0] + SourceOffset;
11282           else if (M == IncomingInputs[1])
11283             M = InputsFixed[1] + SourceOffset;
11284
11285         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11286         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11287       }
11288     } else {
11289       llvm_unreachable("Unhandled input size!");
11290     }
11291
11292     // Now hoist the DWord down to the right half.
11293     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11294     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11295     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11296     for (int &M : HalfMask)
11297       for (int Input : IncomingInputs)
11298         if (M == Input)
11299           M = FreeDWord * 2 + Input % 2;
11300   };
11301   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11302                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
11303   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11304                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
11305
11306   // Now enact all the shuffles we've computed to move the inputs into their
11307   // target half.
11308   if (!isNoopShuffleMask(PSHUFLMask))
11309     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11310                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11311   if (!isNoopShuffleMask(PSHUFHMask))
11312     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11313                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11314   if (!isNoopShuffleMask(PSHUFDMask))
11315     V = DAG.getBitcast(
11316         VT,
11317         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11318                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11319
11320   // At this point, each half should contain all its inputs, and we can then
11321   // just shuffle them into their final position.
11322   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11323          "Failed to lift all the high half inputs to the low mask!");
11324   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11325          "Failed to lift all the low half inputs to the high mask!");
11326
11327   // Do a half shuffle for the low mask.
11328   if (!isNoopShuffleMask(LoMask))
11329     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11330                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11331
11332   // Do a half shuffle with the high mask after shifting its values down.
11333   for (int &M : HiMask)
11334     if (M >= 0)
11335       M -= 4;
11336   if (!isNoopShuffleMask(HiMask))
11337     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11338                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11339
11340   return V;
11341 }
11342
11343 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11344 /// blend if only one input is used.
11345 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11346     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11347     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11348     bool &V2InUse) {
11349   SDValue V1Mask[16];
11350   SDValue V2Mask[16];
11351   V1InUse = false;
11352   V2InUse = false;
11353
11354   int Size = Mask.size();
11355   int Scale = 16 / Size;
11356   for (int i = 0; i < 16; ++i) {
11357     if (Mask[i / Scale] < 0) {
11358       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11359     } else {
11360       const int ZeroMask = 0x80;
11361       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11362                                           : ZeroMask;
11363       int V2Idx = Mask[i / Scale] < Size
11364                       ? ZeroMask
11365                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
11366       if (Zeroable[i / Scale])
11367         V1Idx = V2Idx = ZeroMask;
11368       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11369       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11370       V1InUse |= (ZeroMask != V1Idx);
11371       V2InUse |= (ZeroMask != V2Idx);
11372     }
11373   }
11374
11375   if (V1InUse)
11376     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11377                      DAG.getBitcast(MVT::v16i8, V1),
11378                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11379   if (V2InUse)
11380     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11381                      DAG.getBitcast(MVT::v16i8, V2),
11382                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11383
11384   // If we need shuffled inputs from both, blend the two.
11385   SDValue V;
11386   if (V1InUse && V2InUse)
11387     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11388   else
11389     V = V1InUse ? V1 : V2;
11390
11391   // Cast the result back to the correct type.
11392   return DAG.getBitcast(VT, V);
11393 }
11394
11395 /// \brief Generic lowering of 8-lane i16 shuffles.
11396 ///
11397 /// This handles both single-input shuffles and combined shuffle/blends with
11398 /// two inputs. The single input shuffles are immediately delegated to
11399 /// a dedicated lowering routine.
11400 ///
11401 /// The blends are lowered in one of three fundamental ways. If there are few
11402 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11403 /// of the input is significantly cheaper when lowered as an interleaving of
11404 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11405 /// halves of the inputs separately (making them have relatively few inputs)
11406 /// and then concatenate them.
11407 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11408                                        const APInt &Zeroable,
11409                                        SDValue V1, SDValue V2,
11410                                        const X86Subtarget &Subtarget,
11411                                        SelectionDAG &DAG) {
11412   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11413   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11414   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11415
11416   // Whenever we can lower this as a zext, that instruction is strictly faster
11417   // than any alternative.
11418   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11419           DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11420     return ZExt;
11421
11422   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11423
11424   if (NumV2Inputs == 0) {
11425     // Check for being able to broadcast a single element.
11426     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11427             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11428       return Broadcast;
11429
11430     // Try to use shift instructions.
11431     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11432                                                   Zeroable, Subtarget, DAG))
11433       return Shift;
11434
11435     // Use dedicated unpack instructions for masks that match their pattern.
11436     if (SDValue V =
11437             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11438       return V;
11439
11440     // Try to use byte rotation instructions.
11441     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11442                                                         Mask, Subtarget, DAG))
11443       return Rotate;
11444
11445     // Make a copy of the mask so it can be modified.
11446     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11447     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11448                                                      MutableMask, Subtarget,
11449                                                      DAG);
11450   }
11451
11452   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11453          "All single-input shuffles should be canonicalized to be V1-input "
11454          "shuffles.");
11455
11456   // Try to use shift instructions.
11457   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11458                                                 Zeroable, Subtarget, DAG))
11459     return Shift;
11460
11461   // See if we can use SSE4A Extraction / Insertion.
11462   if (Subtarget.hasSSE4A())
11463     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11464                                                 Zeroable, DAG))
11465       return V;
11466
11467   // There are special ways we can lower some single-element blends.
11468   if (NumV2Inputs == 1)
11469     if (SDValue V = lowerVectorShuffleAsElementInsertion(
11470             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11471       return V;
11472
11473   // We have different paths for blend lowering, but they all must use the
11474   // *exact* same predicate.
11475   bool IsBlendSupported = Subtarget.hasSSE41();
11476   if (IsBlendSupported)
11477     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11478                                                   Zeroable, Subtarget, DAG))
11479       return Blend;
11480
11481   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11482                                                    Zeroable, DAG))
11483     return Masked;
11484
11485   // Use dedicated unpack instructions for masks that match their pattern.
11486   if (SDValue V =
11487           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11488     return V;
11489
11490   // Try to use byte rotation instructions.
11491   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11492           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11493     return Rotate;
11494
11495   if (SDValue BitBlend =
11496           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11497     return BitBlend;
11498
11499   // Try to lower by permuting the inputs into an unpack instruction.
11500   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11501                                                             V2, Mask, DAG))
11502     return Unpack;
11503
11504   // If we can't directly blend but can use PSHUFB, that will be better as it
11505   // can both shuffle and set up the inefficient blend.
11506   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11507     bool V1InUse, V2InUse;
11508     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11509                                               Zeroable, DAG, V1InUse, V2InUse);
11510   }
11511
11512   // We can always bit-blend if we have to so the fallback strategy is to
11513   // decompose into single-input permutes and blends.
11514   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11515                                                     Mask, DAG);
11516 }
11517
11518 /// \brief Check whether a compaction lowering can be done by dropping even
11519 /// elements and compute how many times even elements must be dropped.
11520 ///
11521 /// This handles shuffles which take every Nth element where N is a power of
11522 /// two. Example shuffle masks:
11523 ///
11524 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11525 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11526 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11527 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11528 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11529 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11530 ///
11531 /// Any of these lanes can of course be undef.
11532 ///
11533 /// This routine only supports N <= 3.
11534 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11535 /// for larger N.
11536 ///
11537 /// \returns N above, or the number of times even elements must be dropped if
11538 /// there is such a number. Otherwise returns zero.
11539 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11540                                           bool IsSingleInput) {
11541   // The modulus for the shuffle vector entries is based on whether this is
11542   // a single input or not.
11543   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11544   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11545          "We should only be called with masks with a power-of-2 size!");
11546
11547   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11548
11549   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11550   // and 2^3 simultaneously. This is because we may have ambiguity with
11551   // partially undef inputs.
11552   bool ViableForN[3] = {true, true, true};
11553
11554   for (int i = 0, e = Mask.size(); i < e; ++i) {
11555     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11556     // want.
11557     if (Mask[i] < 0)
11558       continue;
11559
11560     bool IsAnyViable = false;
11561     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11562       if (ViableForN[j]) {
11563         uint64_t N = j + 1;
11564
11565         // The shuffle mask must be equal to (i * 2^N) % M.
11566         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11567           IsAnyViable = true;
11568         else
11569           ViableForN[j] = false;
11570       }
11571     // Early exit if we exhaust the possible powers of two.
11572     if (!IsAnyViable)
11573       break;
11574   }
11575
11576   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11577     if (ViableForN[j])
11578       return j + 1;
11579
11580   // Return 0 as there is no viable power of two.
11581   return 0;
11582 }
11583
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
///
/// Strategies are attempted in the order they appear below and the first one
/// that produces a node is returned: shift, byte rotate, zero/any-extend,
/// SSE4A (if available), the single-input broadcast and widen-via-duplication
/// tricks, bit mask, UNPCK, the PSHUFB-based paths (SSSE3 only; this branch
/// always returns), single-element insertion, bit blend, PACKUS compaction,
/// decomposed shuffle+blend for masks that reference V2, and finally the
/// unpack/shuffle/pack fallback for single-input masks.
///
/// \param Mask     16-entry shuffle mask (asserted). Negative entries are
///                 skipped as undef; entries >= 16 are counted as V2 elements.
/// \param Zeroable Forwarded unchanged to the helpers that accept it.
/// \param V1,V2    The v16i8 operands (asserted).
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // Number of mask entries that read from V2 (index >= 16).
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    //
    // The predicate below accepts a mask when, in every pair of adjacent
    // output bytes (2k, 2k+1), the two entries are equal or at least one of
    // them is undef.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    // Builds: i16 pre-shuffle of V1 -> UNPCKL/UNPCKH of the result with itself
    // -> i16 post-shuffle. Returns an empty SDValue when the mask does not
    // qualify or the moving inputs do not fit into the target half. Note that
    // V1 is captured by reference and overwritten, but only after the last
    // `return SDValue()` bail-out.
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      // Sorted, de-duplicated source bytes referenced from the low half
      // (indices 0-7) of the input.
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      // Same for source bytes with index 8 and above.
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      // Gather everything into whichever half already holds at least as many
      // distinct inputs; the inputs of the other half have to be moved.
      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      // PreDupI16Shuffle is the i16 shuffle applied before the unpack;
      // LaneMap records, for each original source byte, the byte position it
      // occupies after that shuffle.
      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      // In-place inputs keep their i16 lane and therefore their byte index.
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I/2] = I/2;
        LaneMap[I] = I;
      }
      // [j, je) is the range of i16 lanes making up the target half.
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple i16
            // shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      // Apply the pre-duplication shuffle on V1 viewed as v8i16.
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      // Build the final i16 shuffle: output i16 lane i / 2 takes the mapped
      // position of the source byte, rebased by 8 when the high half was the
      // target. Both bytes of an output lane must agree (asserted).
      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    // Filled in by the helper: whether any selector byte reads from V1 / V2.
    bool V1InUse = false;
    bool V2InUse = false;

    // The PSHUFB lowering is built up front; it is what gets returned unless
    // one of the alternatives below succeeds.
    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }

  // Everything below is only reached when the subtarget lacks SSSE3.

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.

    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped. The 0xFF constant is created in the element type
    // selected by NumEvenDrops and then bitcast to bytes.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together. For a single input, V1 is used as both
    // PACKUS operands.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    // One pack has been emitted above; each remaining drop packs the previous
    // result with itself.
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }

  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  // Split the mask by output half: LoBlendMask describes output bytes 0-7 and
  // HiBlendMask output bytes 8-15. Undef entries stay -1.
  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf. Every defined entry is
    // even here, so halving turns a byte index into an i16 lane index.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  // Shuffle each output half as a v8i16 shuffle of the two widened vectors.
  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  // Pack the two i16 results back into a single v16i8.
  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
11874
11875 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11876 ///
11877 /// This routine breaks down the specific type of 128-bit shuffle and
11878 /// dispatches to the lowering routines accordingly.
11879 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11880                                         MVT VT, SDValue V1, SDValue V2,
11881                                         const APInt &Zeroable,
11882                                         const X86Subtarget &Subtarget,
11883                                         SelectionDAG &DAG) {
11884   switch (VT.SimpleTy) {
11885   case MVT::v2i64:
11886     return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11887   case MVT::v2f64:
11888     return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11889   case MVT::v4i32:
11890     return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11891   case MVT::v4f32:
11892     return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11893   case MVT::v8i16:
11894     return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11895   case MVT::v16i8:
11896     return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11897
11898   default:
11899     llvm_unreachable("Unimplemented!");
11900   }
11901 }
11902
11903 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11904 ///
11905 /// This routine just extracts two subvectors, shuffles them independently, and
11906 /// then concatenates them back together. This should work effectively with all
11907 /// AVX vector shuffle types.
11908 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11909                                           SDValue V2, ArrayRef<int> Mask,
11910                                           SelectionDAG &DAG) {
11911   assert(VT.getSizeInBits() >= 256 &&
11912          "Only for 256-bit or wider vector shuffles!");
11913   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11914   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11915
11916   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11917   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11918
11919   int NumElements = VT.getVectorNumElements();
11920   int SplitNumElements = NumElements / 2;
11921   MVT ScalarVT = VT.getVectorElementType();
11922   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11923
11924   // Rather than splitting build-vectors, just build two narrower build
11925   // vectors. This helps shuffling with splats and zeros.
11926   auto SplitVector = [&](SDValue V) {
11927     V = peekThroughBitcasts(V);
11928
11929     MVT OrigVT = V.getSimpleValueType();
11930     int OrigNumElements = OrigVT.getVectorNumElements();
11931     int OrigSplitNumElements = OrigNumElements / 2;
11932     MVT OrigScalarVT = OrigVT.getVectorElementType();
11933     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11934
11935     SDValue LoV, HiV;
11936
11937     auto *BV = dyn_cast<BuildVectorSDNode>(V);
11938     if (!BV) {
11939       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11940                         DAG.getIntPtrConstant(0, DL));
11941       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11942                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11943     } else {
11944
11945       SmallVector<SDValue, 16> LoOps, HiOps;
11946       for (int i = 0; i < OrigSplitNumElements; ++i) {
11947         LoOps.push_back(BV->getOperand(i));
11948         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11949       }
11950       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11951       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11952     }
11953     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11954                           DAG.getBitcast(SplitVT, HiV));
11955   };
11956
11957   SDValue LoV1, HiV1, LoV2, HiV2;
11958   std::tie(LoV1, HiV1) = SplitVector(V1);
11959   std::tie(LoV2, HiV2) = SplitVector(V2);
11960
11961   // Now create two 4-way blends of these half-width vectors.
11962   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11963     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11964     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11965     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11966     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11967     for (int i = 0; i < SplitNumElements; ++i) {
11968       int M = HalfMask[i];
11969       if (M >= NumElements) {
11970         if (M >= NumElements + SplitNumElements)
11971           UseHiV2 = true;
11972         else
11973           UseLoV2 = true;
11974         V2BlendMask[i] = M - NumElements;
11975         BlendMask[i] = SplitNumElements + i;
11976       } else if (M >= 0) {
11977         if (M >= SplitNumElements)
11978           UseHiV1 = true;
11979         else
11980           UseLoV1 = true;
11981         V1BlendMask[i] = M;
11982         BlendMask[i] = i;
11983       }
11984     }
11985
11986     // Because the lowering happens after all combining takes place, we need to
11987     // manually combine these blend masks as much as possible so that we create
11988     // a minimal number of high-level vector shuffle nodes.
11989
11990     // First try just blending the halves of V1 or V2.
11991     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11992       return DAG.getUNDEF(SplitVT);
11993     if (!UseLoV2 && !UseHiV2)
11994       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11995     if (!UseLoV1 && !UseHiV1)
11996       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11997
11998     SDValue V1Blend, V2Blend;
11999     if (UseLoV1 && UseHiV1) {
12000       V1Blend =
12001         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12002     } else {
12003       // We only use half of V1 so map the usage down into the final blend mask.
12004       V1Blend = UseLoV1 ? LoV1 : HiV1;
12005       for (int i = 0; i < SplitNumElements; ++i)
12006         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12007           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12008     }
12009     if (UseLoV2 && UseHiV2) {
12010       V2Blend =
12011         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12012     } else {
12013       // We only use half of V2 so map the usage down into the final blend mask.
12014       V2Blend = UseLoV2 ? LoV2 : HiV2;
12015       for (int i = 0; i < SplitNumElements; ++i)
12016         if (BlendMask[i] >= SplitNumElements)
12017           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12018     }
12019     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12020   };
12021   SDValue Lo = HalfBlend(LoMask);
12022   SDValue Hi = HalfBlend(HiMask);
12023   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12024 }
12025
12026 /// \brief Either split a vector in halves or decompose the shuffles and the
12027 /// blend.
12028 ///
12029 /// This is provided as a good fallback for many lowerings of non-single-input
12030 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12031 /// between splitting the shuffle into 128-bit components and stitching those
12032 /// back together vs. extracting the single-input shuffles and blending those
12033 /// results.
12034 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12035                                                 SDValue V1, SDValue V2,
12036                                                 ArrayRef<int> Mask,
12037                                                 SelectionDAG &DAG) {
12038   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12039          "shuffles as it could then recurse on itself.");
12040   int Size = Mask.size();
12041
12042   // If this can be modeled as a broadcast of two elements followed by a blend,
12043   // prefer that lowering. This is especially important because broadcasts can
12044   // often fold with memory operands.
12045   auto DoBothBroadcast = [&] {
12046     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12047     for (int M : Mask)
12048       if (M >= Size) {
12049         if (V2BroadcastIdx < 0)
12050           V2BroadcastIdx = M - Size;
12051         else if (M - Size != V2BroadcastIdx)
12052           return false;
12053       } else if (M >= 0) {
12054         if (V1BroadcastIdx < 0)
12055           V1BroadcastIdx = M;
12056         else if (M != V1BroadcastIdx)
12057           return false;
12058       }
12059     return true;
12060   };
12061   if (DoBothBroadcast())
12062     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12063                                                       DAG);
12064
12065   // If the inputs all stem from a single 128-bit lane of each input, then we
12066   // split them rather than blending because the split will decompose to
12067   // unusually few instructions.
12068   int LaneCount = VT.getSizeInBits() / 128;
12069   int LaneSize = Size / LaneCount;
12070   SmallBitVector LaneInputs[2];
12071   LaneInputs[0].resize(LaneCount, false);
12072   LaneInputs[1].resize(LaneCount, false);
12073   for (int i = 0; i < Size; ++i)
12074     if (Mask[i] >= 0)
12075       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12076   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12077     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12078
12079   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12080   // that the decomposed single-input shuffles don't end up here.
12081   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12082 }
12083
12084 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12085 /// a permutation and blend of those lanes.
12086 ///
12087 /// This essentially blends the out-of-lane inputs to each lane into the lane
12088 /// from a permuted copy of the vector. This lowering strategy results in four
12089 /// instructions in the worst case for a single-input cross lane shuffle which
12090 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12091 /// of. Special cases for each particular shuffle pattern should be handled
12092 /// prior to trying this lowering.
12093 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12094                                                        SDValue V1, SDValue V2,
12095                                                        ArrayRef<int> Mask,
12096                                                        SelectionDAG &DAG) {
12097   // FIXME: This should probably be generalized for 512-bit vectors as well.
12098   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12099   int Size = Mask.size();
12100   int LaneSize = Size / 2;
12101
12102   // If there are only inputs from one 128-bit lane, splitting will in fact be
12103   // less expensive. The flags track whether the given lane contains an element
12104   // that crosses to another lane.
12105   bool LaneCrossing[2] = {false, false};
12106   for (int i = 0; i < Size; ++i)
12107     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12108       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12109   if (!LaneCrossing[0] || !LaneCrossing[1])
12110     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12111
12112   assert(V2.isUndef() &&
12113          "This last part of this routine only works on single input shuffles");
12114
12115   SmallVector<int, 32> FlippedBlendMask(Size);
12116   for (int i = 0; i < Size; ++i)
12117     FlippedBlendMask[i] =
12118         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12119                                 ? Mask[i]
12120                                 : Mask[i] % LaneSize +
12121                                       (i / LaneSize) * LaneSize + Size);
12122
12123   // Flip the vector, and blend the results which should now be in-lane. The
12124   // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
12125   // 5 for the high source. The value 3 selects the high half of source 2 and
12126   // the value 2 selects the low half of source 2. We only use source 2 to
12127   // allow folding it into a memory operand.
12128   unsigned PERMMask = 3 | 2 << 4;
12129   SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
12130                                 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
12131   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12132 }
12133
12134 /// \brief Handle lowering 2-lane 128-bit shuffles.
12135 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12136                                         SDValue V2, ArrayRef<int> Mask,
12137                                         const APInt &Zeroable,
12138                                         const X86Subtarget &Subtarget,
12139                                         SelectionDAG &DAG) {
12140   SmallVector<int, 4> WidenedMask;
12141   if (!canWidenShuffleElements(Mask, WidenedMask))
12142     return SDValue();
12143
12144   // TODO: If minimizing size and one of the inputs is a zero vector and the
12145   // the zero vector has only one use, we could use a VPERM2X128 to save the
12146   // instruction bytes needed to explicitly generate the zero vector.
12147
12148   // Blends are faster and handle all the non-lane-crossing cases.
12149   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12150                                                 Zeroable, Subtarget, DAG))
12151     return Blend;
12152
12153   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
12154   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
12155
12156   // If either input operand is a zero vector, use VPERM2X128 because its mask
12157   // allows us to replace the zero input with an implicit zero.
12158   if (!IsV1Zero && !IsV2Zero) {
12159     // Check for patterns which can be matched with a single insert of a 128-bit
12160     // subvector.
12161     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12162     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12163       // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
12164       if (Subtarget.hasAVX2() && V2.isUndef())
12165         return SDValue();
12166
12167       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12168       // this will likely become vinsertf128 which can't fold a 256-bit memop.
12169       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12170         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12171                                      VT.getVectorNumElements() / 2);
12172         SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12173                                   DAG.getIntPtrConstant(0, DL));
12174         SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12175                                   OnlyUsesV1 ? V1 : V2,
12176                                   DAG.getIntPtrConstant(0, DL));
12177         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12178       }
12179     }
12180   }
12181
12182   // Otherwise form a 128-bit permutation. After accounting for undefs,
12183   // convert the 64-bit shuffle mask selection values into 128-bit
12184   // selection bits by dividing the indexes by 2 and shifting into positions
12185   // defined by a vperm2*128 instruction's immediate control byte.
12186
12187   // The immediate permute control byte looks like this:
12188   //    [1:0] - select 128 bits from sources for low half of destination
12189   //    [2]   - ignore
12190   //    [3]   - zero low half of destination
12191   //    [5:4] - select 128 bits from sources for high half of destination
12192   //    [6]   - ignore
12193   //    [7]   - zero high half of destination
12194
12195   int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
12196   int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
12197
12198   unsigned PermMask = MaskLO | (MaskHI << 4);
12199
12200   // If either input is a zero vector, replace it with an undef input.
12201   // Shuffle mask values <  4 are selecting elements of V1.
12202   // Shuffle mask values >= 4 are selecting elements of V2.
12203   // Adjust each half of the permute mask by clearing the half that was
12204   // selecting the zero vector and setting the zero mask bit.
12205   if (IsV1Zero) {
12206     V1 = DAG.getUNDEF(VT);
12207     if (MaskLO < 2)
12208       PermMask = (PermMask & 0xf0) | 0x08;
12209     if (MaskHI < 2)
12210       PermMask = (PermMask & 0x0f) | 0x80;
12211   }
12212   if (IsV2Zero) {
12213     V2 = DAG.getUNDEF(VT);
12214     if (MaskLO >= 2)
12215       PermMask = (PermMask & 0xf0) | 0x08;
12216     if (MaskHI >= 2)
12217       PermMask = (PermMask & 0x0f) | 0x80;
12218   }
12219
12220   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12221                      DAG.getConstant(PermMask, DL, MVT::i8));
12222 }
12223
12224 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12225 /// shuffling each lane.
12226 ///
12227 /// This will only succeed when the result of fixing the 128-bit lanes results
12228 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12229 /// each 128-bit lanes. This handles many cases where we can quickly blend away
12230 /// the lane crosses early and then use simpler shuffles within each lane.
12231 ///
12232 /// FIXME: It might be worthwhile at some point to support this without
12233 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12234 /// in x86 only floating point has interesting non-repeating shuffles, and even
12235 /// those are still *marginally* more expensive.
12236 static SDValue lowerVectorShuffleByMerging128BitLanes(
12237     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12238     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12239   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12240
12241   int Size = Mask.size();
12242   int LaneSize = 128 / VT.getScalarSizeInBits();
12243   int NumLanes = Size / LaneSize;
12244   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12245
12246   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12247   // check whether the in-128-bit lane shuffles share a repeating pattern.
12248   SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12249   SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12250   for (int i = 0; i < Size; ++i) {
12251     if (Mask[i] < 0)
12252       continue;
12253
12254     int j = i / LaneSize;
12255
12256     if (Lanes[j] < 0) {
12257       // First entry we've seen for this lane.
12258       Lanes[j] = Mask[i] / LaneSize;
12259     } else if (Lanes[j] != Mask[i] / LaneSize) {
12260       // This doesn't match the lane selected previously!
12261       return SDValue();
12262     }
12263
12264     // Check that within each lane we have a consistent shuffle mask.
12265     int k = i % LaneSize;
12266     if (InLaneMask[k] < 0) {
12267       InLaneMask[k] = Mask[i] % LaneSize;
12268     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12269       // This doesn't fit a repeating in-lane mask.
12270       return SDValue();
12271     }
12272   }
12273
12274   // First shuffle the lanes into place.
12275   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12276                                 VT.getSizeInBits() / 64);
12277   SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12278   for (int i = 0; i < NumLanes; ++i)
12279     if (Lanes[i] >= 0) {
12280       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12281       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12282     }
12283
12284   V1 = DAG.getBitcast(LaneVT, V1);
12285   V2 = DAG.getBitcast(LaneVT, V2);
12286   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12287
12288   // Cast it back to the type we actually want.
12289   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12290
12291   // Now do a simple shuffle that isn't lane crossing.
12292   SmallVector<int, 8> NewMask((unsigned)Size, -1);
12293   for (int i = 0; i < Size; ++i)
12294     if (Mask[i] >= 0)
12295       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12296   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12297          "Must not introduce lane crosses at this point!");
12298
12299   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12300 }
12301
12302 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12303 /// This allows for fast cases such as subvector extraction/insertion
12304 /// or shuffling smaller vector types which can lower more efficiently.
12305 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12306                                                SDValue V1, SDValue V2,
12307                                                ArrayRef<int> Mask,
12308                                                const X86Subtarget &Subtarget,
12309                                                SelectionDAG &DAG) {
12310   assert(VT.is256BitVector() && "Expected 256-bit vector");
12311
12312   unsigned NumElts = VT.getVectorNumElements();
12313   unsigned HalfNumElts = NumElts / 2;
12314   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12315
12316   bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12317   bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12318   if (!UndefLower && !UndefUpper)
12319     return SDValue();
12320
12321   // Upper half is undef and lower half is whole upper subvector.
12322   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12323   if (UndefUpper &&
12324       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12325     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12326                              DAG.getIntPtrConstant(HalfNumElts, DL));
12327     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12328                        DAG.getIntPtrConstant(0, DL));
12329   }
12330
12331   // Lower half is undef and upper half is whole lower subvector.
12332   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12333   if (UndefLower &&
12334       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12335     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12336                              DAG.getIntPtrConstant(0, DL));
12337     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12338                        DAG.getIntPtrConstant(HalfNumElts, DL));
12339   }
12340
12341   // If the shuffle only uses two of the four halves of the input operands,
12342   // then extract them and perform the 'half' shuffle at half width.
12343   // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12344   int HalfIdx1 = -1, HalfIdx2 = -1;
12345   SmallVector<int, 8> HalfMask(HalfNumElts);
12346   unsigned Offset = UndefLower ? HalfNumElts : 0;
12347   for (unsigned i = 0; i != HalfNumElts; ++i) {
12348     int M = Mask[i + Offset];
12349     if (M < 0) {
12350       HalfMask[i] = M;
12351       continue;
12352     }
12353
12354     // Determine which of the 4 half vectors this element is from.
12355     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12356     int HalfIdx = M / HalfNumElts;
12357
12358     // Determine the element index into its half vector source.
12359     int HalfElt = M % HalfNumElts;
12360
12361     // We can shuffle with up to 2 half vectors, set the new 'half'
12362     // shuffle mask accordingly.
12363     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12364       HalfMask[i] = HalfElt;
12365       HalfIdx1 = HalfIdx;
12366       continue;
12367     }
12368     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12369       HalfMask[i] = HalfElt + HalfNumElts;
12370       HalfIdx2 = HalfIdx;
12371       continue;
12372     }
12373
12374     // Too many half vectors referenced.
12375     return SDValue();
12376   }
12377   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12378
12379   // Only shuffle the halves of the inputs when useful.
12380   int NumLowerHalves =
12381       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12382   int NumUpperHalves =
12383       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12384
12385   // uuuuXXXX - don't extract uppers just to insert again.
12386   if (UndefLower && NumUpperHalves != 0)
12387     return SDValue();
12388
12389   // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12390   if (UndefUpper && NumUpperHalves == 2)
12391     return SDValue();
12392
12393   // AVX2 - XXXXuuuu - always extract lowers.
12394   if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12395     // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12396     if (VT == MVT::v4f64 || VT == MVT::v4i64)
12397       return SDValue();
12398     // AVX2 supports variable 32-bit element cross-lane shuffles.
12399     if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12400       // XXXXuuuu - don't extract lowers and uppers.
12401       if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12402         return SDValue();
12403     }
12404   }
12405
12406   auto GetHalfVector = [&](int HalfIdx) {
12407     if (HalfIdx < 0)
12408       return DAG.getUNDEF(HalfVT);
12409     SDValue V = (HalfIdx < 2 ? V1 : V2);
12410     HalfIdx = (HalfIdx % 2) * HalfNumElts;
12411     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12412                        DAG.getIntPtrConstant(HalfIdx, DL));
12413   };
12414
12415   SDValue Half1 = GetHalfVector(HalfIdx1);
12416   SDValue Half2 = GetHalfVector(HalfIdx2);
12417   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12418   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12419                      DAG.getIntPtrConstant(Offset, DL));
12420 }
12421
12422 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12423 /// given mask.
12424 ///
12425 /// This returns true if the elements from a particular input are already in the
12426 /// slot required by the given mask and require no permutation.
12427 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12428   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12429   int Size = Mask.size();
12430   for (int i = 0; i < Size; ++i)
12431     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12432       return false;
12433
12434   return true;
12435 }
12436
12437 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12438 /// every lane can be represented as the same repeating mask - allowing us to
12439 /// shuffle the sources with the repeating shuffle and then permute the result
12440 /// to the destination lanes.
12441 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12442     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12443     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12444   int NumElts = VT.getVectorNumElements();
12445   int NumLanes = VT.getSizeInBits() / 128;
12446   int NumLaneElts = NumElts / NumLanes;
12447
12448   // On AVX2 we may be able to just shuffle the lowest elements and then
12449   // broadcast the result.
12450   if (Subtarget.hasAVX2()) {
12451     for (unsigned BroadcastSize : {16, 32, 64}) {
12452       if (BroadcastSize <= VT.getScalarSizeInBits())
12453         continue;
12454       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12455
12456       // Attempt to match a repeating pattern every NumBroadcastElts,
12457       // accounting for UNDEFs but only references the lowest 128-bit
12458       // lane of the inputs.
12459       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12460         for (int i = 0; i != NumElts; i += NumBroadcastElts)
12461           for (int j = 0; j != NumBroadcastElts; ++j) {
12462             int M = Mask[i + j];
12463             if (M < 0)
12464               continue;
12465             int &R = RepeatMask[j];
12466             if (0 != ((M % NumElts) / NumLaneElts))
12467               return false;
12468             if (0 <= R && R != M)
12469               return false;
12470             R = M;
12471           }
12472         return true;
12473       };
12474
12475       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12476       if (!FindRepeatingBroadcastMask(RepeatMask))
12477         continue;
12478
12479       // Shuffle the (lowest) repeated elements in place for broadcast.
12480       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12481
12482       // Shuffle the actual broadcast.
12483       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12484       for (int i = 0; i != NumElts; i += NumBroadcastElts)
12485         for (int j = 0; j != NumBroadcastElts; ++j)
12486           BroadcastMask[i + j] = j;
12487       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12488                                   BroadcastMask);
12489     }
12490   }
12491
12492   // Bail if the shuffle mask doesn't cross 128-bit lanes.
12493   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12494     return SDValue();
12495
12496   // Bail if we already have a repeated lane shuffle mask.
12497   SmallVector<int, 8> RepeatedShuffleMask;
12498   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12499     return SDValue();
12500
12501   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12502   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12503   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12504   int NumSubLanes = NumLanes * SubLaneScale;
12505   int NumSubLaneElts = NumLaneElts / SubLaneScale;
12506
12507   // Check that all the sources are coming from the same lane and see if we can
12508   // form a repeating shuffle mask (local to each sub-lane). At the same time,
12509   // determine the source sub-lane for each destination sub-lane.
12510   int TopSrcSubLane = -1;
12511   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12512   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12513       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12514       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12515
12516   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12517     // Extract the sub-lane mask, check that it all comes from the same lane
12518     // and normalize the mask entries to come from the first lane.
12519     int SrcLane = -1;
12520     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12521     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12522       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12523       if (M < 0)
12524         continue;
12525       int Lane = (M % NumElts) / NumLaneElts;
12526       if ((0 <= SrcLane) && (SrcLane != Lane))
12527         return SDValue();
12528       SrcLane = Lane;
12529       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12530       SubLaneMask[Elt] = LocalM;
12531     }
12532
12533     // Whole sub-lane is UNDEF.
12534     if (SrcLane < 0)
12535       continue;
12536
12537     // Attempt to match against the candidate repeated sub-lane masks.
12538     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12539       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12540         for (int i = 0; i != NumSubLaneElts; ++i) {
12541           if (M1[i] < 0 || M2[i] < 0)
12542             continue;
12543           if (M1[i] != M2[i])
12544             return false;
12545         }
12546         return true;
12547       };
12548
12549       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12550       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12551         continue;
12552
12553       // Merge the sub-lane mask into the matching repeated sub-lane mask.
12554       for (int i = 0; i != NumSubLaneElts; ++i) {
12555         int M = SubLaneMask[i];
12556         if (M < 0)
12557           continue;
12558         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12559                "Unexpected mask element");
12560         RepeatedSubLaneMask[i] = M;
12561       }
12562
12563       // Track the top most source sub-lane - by setting the remaining to UNDEF
12564       // we can greatly simplify shuffle matching.
12565       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12566       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12567       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12568       break;
12569     }
12570
12571     // Bail if we failed to find a matching repeated sub-lane mask.
12572     if (Dst2SrcSubLanes[DstSubLane] < 0)
12573       return SDValue();
12574   }
12575   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12576          "Unexpected source lane");
12577
12578   // Create a repeating shuffle mask for the entire vector.
12579   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12580   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12581     int Lane = SubLane / SubLaneScale;
12582     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12583     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12584       int M = RepeatedSubLaneMask[Elt];
12585       if (M < 0)
12586         continue;
12587       int Idx = (SubLane * NumSubLaneElts) + Elt;
12588       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12589     }
12590   }
12591   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12592
12593   // Shuffle each source sub-lane to its destination.
12594   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12595   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12596     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12597     if (SrcSubLane < 0)
12598       continue;
12599     for (int j = 0; j != NumSubLaneElts; ++j)
12600       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12601   }
12602
12603   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12604                               SubLaneMask);
12605 }
12606
12607 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12608                                          unsigned &ShuffleImm,
12609                                          ArrayRef<int> Mask) {
12610   int NumElts = VT.getVectorNumElements();
12611   assert(VT.getScalarSizeInBits() == 64 &&
12612          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12613          "Unexpected data type for VSHUFPD");
12614
12615   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
12616   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
12617   ShuffleImm = 0;
12618   bool ShufpdMask = true;
12619   bool CommutableMask = true;
12620   for (int i = 0; i < NumElts; ++i) {
12621     if (Mask[i] == SM_SentinelUndef)
12622       continue;
12623     if (Mask[i] < 0)
12624       return false;
12625     int Val = (i & 6) + NumElts * (i & 1);
12626     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12627     if (Mask[i] < Val || Mask[i] > Val + 1)
12628       ShufpdMask = false;
12629     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12630       CommutableMask = false;
12631     ShuffleImm |= (Mask[i] % 2) << i;
12632   }
12633
12634   if (ShufpdMask)
12635     return true;
12636   if (CommutableMask) {
12637     std::swap(V1, V2);
12638     return true;
12639   }
12640
12641   return false;
12642 }
12643
12644 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12645                                             ArrayRef<int> Mask, SDValue V1,
12646                                             SDValue V2, SelectionDAG &DAG) {
12647   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
12648          "Unexpected data type for VSHUFPD");
12649
12650   unsigned Immediate = 0;
12651   if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12652     return SDValue();
12653
12654   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12655                      DAG.getConstant(Immediate, DL, MVT::i8));
12656 }
12657
12658 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12659                                            ArrayRef<int> Mask, SDValue V1,
12660                                            SDValue V2, SelectionDAG &DAG) {
12661   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12662   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12663
12664   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12665   if (V2.isUndef())
12666     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12667
12668   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12669 }
12670
12671 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12672 ///
12673 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12674 /// isn't available.
12675 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12676                                        const APInt &Zeroable,
12677                                        SDValue V1, SDValue V2,
12678                                        const X86Subtarget &Subtarget,
12679                                        SelectionDAG &DAG) {
12680   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12681   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12682   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12683
12684   if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12685                                            Zeroable, Subtarget, DAG))
12686     return V;
12687
12688   if (V2.isUndef()) {
12689     // Check for being able to broadcast a single element.
12690     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12691             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12692       return Broadcast;
12693
12694     // Use low duplicate instructions for masks that match their pattern.
12695     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12696       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12697
12698     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12699       // Non-half-crossing single input shuffles can be lowered with an
12700       // interleaved permutation.
12701       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12702                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12703       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12704                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12705     }
12706
12707     // With AVX2 we have direct support for this permutation.
12708     if (Subtarget.hasAVX2())
12709       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12710                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12711
12712     // Try to create an in-lane repeating shuffle mask and then shuffle the
12713     // the results into the target lanes.
12714     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12715             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12716       return V;
12717
12718     // Otherwise, fall back.
12719     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12720                                                    DAG);
12721   }
12722
12723   // Use dedicated unpack instructions for masks that match their pattern.
12724   if (SDValue V =
12725           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12726     return V;
12727
12728   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12729                                                 Zeroable, Subtarget, DAG))
12730     return Blend;
12731
12732   // Check if the blend happens to exactly fit that of SHUFPD.
12733   if (SDValue Op =
12734       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12735     return Op;
12736
12737   // Try to create an in-lane repeating shuffle mask and then shuffle the
12738   // the results into the target lanes.
12739   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12740           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12741     return V;
12742
12743   // Try to simplify this by merging 128-bit lanes to enable a lane-based
12744   // shuffle. However, if we have AVX2 and either inputs are already in place,
12745   // we will be able to shuffle even across lanes the other input in a single
12746   // instruction so skip this pattern.
12747   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12748                                 isShuffleMaskInputInPlace(1, Mask))))
12749     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12750             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12751       return Result;
12752   // If we have VLX support, we can use VEXPAND.
12753   if (Subtarget.hasVLX())
12754     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12755                                                V1, V2, DAG, Subtarget))
12756       return V;
12757
12758   // If we have AVX2 then we always want to lower with a blend because an v4 we
12759   // can fully permute the elements.
12760   if (Subtarget.hasAVX2())
12761     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12762                                                       Mask, DAG);
12763
12764   // Otherwise fall back on generic lowering.
12765   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12766 }
12767
12768 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12769 ///
12770 /// This routine is only called when we have AVX2 and thus a reasonable
12771 /// instruction set for v4i64 shuffling..
12772 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12773                                        const APInt &Zeroable,
12774                                        SDValue V1, SDValue V2,
12775                                        const X86Subtarget &Subtarget,
12776                                        SelectionDAG &DAG) {
12777   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12778   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12779   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12780   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12781
12782   if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12783                                            Zeroable, Subtarget, DAG))
12784     return V;
12785
12786   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12787                                                 Zeroable, Subtarget, DAG))
12788     return Blend;
12789
12790   // Check for being able to broadcast a single element.
12791   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12792                                                         Mask, Subtarget, DAG))
12793     return Broadcast;
12794
12795   if (V2.isUndef()) {
12796     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12797     // can use lower latency instructions that will operate on both lanes.
12798     SmallVector<int, 2> RepeatedMask;
12799     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12800       SmallVector<int, 4> PSHUFDMask;
12801       scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12802       return DAG.getBitcast(
12803           MVT::v4i64,
12804           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12805                       DAG.getBitcast(MVT::v8i32, V1),
12806                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12807     }
12808
12809     // AVX2 provides a direct instruction for permuting a single input across
12810     // lanes.
12811     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12812                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12813   }
12814
12815   // Try to use shift instructions.
12816   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12817                                                 Zeroable, Subtarget, DAG))
12818     return Shift;
12819
12820   // If we have VLX support, we can use VALIGN or VEXPAND.
12821   if (Subtarget.hasVLX()) {
12822     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12823                                                     Mask, Subtarget, DAG))
12824       return Rotate;
12825
12826     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12827                                                V1, V2, DAG, Subtarget))
12828       return V;
12829   }
12830
12831   // Try to use PALIGNR.
12832   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12833                                                       Mask, Subtarget, DAG))
12834     return Rotate;
12835
12836   // Use dedicated unpack instructions for masks that match their pattern.
12837   if (SDValue V =
12838           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12839     return V;
12840
12841   // Try to simplify this by merging 128-bit lanes to enable a lane-based
12842   // shuffle. However, if we have AVX2 and either inputs are already in place,
12843   // we will be able to shuffle even across lanes the other input in a single
12844   // instruction so skip this pattern.
12845   if (!isShuffleMaskInputInPlace(0, Mask) &&
12846       !isShuffleMaskInputInPlace(1, Mask))
12847     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12848             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12849       return Result;
12850
12851   // Otherwise fall back on generic blend lowering.
12852   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12853                                                     Mask, DAG);
12854 }
12855
12856 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12857 ///
12858 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12859 /// isn't available.
12860 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12861                                        const APInt &Zeroable,
12862                                        SDValue V1, SDValue V2,
12863                                        const X86Subtarget &Subtarget,
12864                                        SelectionDAG &DAG) {
12865   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12866   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12867   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12868
12869   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12870                                                 Zeroable, Subtarget, DAG))
12871     return Blend;
12872
12873   // Check for being able to broadcast a single element.
12874   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12875                                                         Mask, Subtarget, DAG))
12876     return Broadcast;
12877
12878   // If the shuffle mask is repeated in each 128-bit lane, we have many more
12879   // options to efficiently lower the shuffle.
12880   SmallVector<int, 4> RepeatedMask;
12881   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12882     assert(RepeatedMask.size() == 4 &&
12883            "Repeated masks must be half the mask width!");
12884
12885     // Use even/odd duplicate instructions for masks that match their pattern.
12886     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12887       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12888     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12889       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12890
12891     if (V2.isUndef())
12892       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12893                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12894
12895     // Use dedicated unpack instructions for masks that match their pattern.
12896     if (SDValue V =
12897             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12898       return V;
12899
12900     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12901     // have already handled any direct blends.
12902     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12903   }
12904
12905   // Try to create an in-lane repeating shuffle mask and then shuffle the
12906   // the results into the target lanes.
12907   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12908           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12909     return V;
12910
12911   // If we have a single input shuffle with different shuffle patterns in the
12912   // two 128-bit lanes use the variable mask to VPERMILPS.
12913   if (V2.isUndef()) {
12914     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12915     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12916       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12917
12918     if (Subtarget.hasAVX2())
12919       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12920
12921     // Otherwise, fall back.
12922     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12923                                                    DAG);
12924   }
12925
12926   // Try to simplify this by merging 128-bit lanes to enable a lane-based
12927   // shuffle.
12928   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12929           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12930     return Result;
12931   // If we have VLX support, we can use VEXPAND.
12932   if (Subtarget.hasVLX())
12933     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12934                                                V1, V2, DAG, Subtarget))
12935       return V;
12936
12937   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12938   // since after split we get a more efficient code using vpunpcklwd and
12939   // vpunpckhwd instrs than vblend.
12940   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12941     if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12942                                                      Mask, DAG))
12943       return V;
12944
12945   // If we have AVX2 then we always want to lower with a blend because at v8 we
12946   // can fully permute the elements.
12947   if (Subtarget.hasAVX2())
12948     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12949                                                       Mask, DAG);
12950
12951   // Otherwise fall back on generic lowering.
12952   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12953 }
12954
12955 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12956 ///
12957 /// This routine is only called when we have AVX2 and thus a reasonable
12958 /// instruction set for v8i32 shuffling..
12959 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12960                                        const APInt &Zeroable,
12961                                        SDValue V1, SDValue V2,
12962                                        const X86Subtarget &Subtarget,
12963                                        SelectionDAG &DAG) {
12964   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12965   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12966   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12967   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12968
12969   // Whenever we can lower this as a zext, that instruction is strictly faster
12970   // than any alternative. It also allows us to fold memory operands into the
12971   // shuffle in many cases.
12972   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12973           DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12974     return ZExt;
12975
12976   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12977   // since after split we get a more efficient code than vblend by using
12978   // vpunpcklwd and vpunpckhwd instrs.
12979   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12980       !Subtarget.hasAVX512())
12981     if (SDValue V =
12982             lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12983       return V;
12984
12985   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12986                                                 Zeroable, Subtarget, DAG))
12987     return Blend;
12988
12989   // Check for being able to broadcast a single element.
12990   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12991                                                         Mask, Subtarget, DAG))
12992     return Broadcast;
12993
12994   // If the shuffle mask is repeated in each 128-bit lane we can use more
12995   // efficient instructions that mirror the shuffles across the two 128-bit
12996   // lanes.
12997   SmallVector<int, 4> RepeatedMask;
12998   bool Is128BitLaneRepeatedShuffle =
12999       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13000   if (Is128BitLaneRepeatedShuffle) {
13001     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13002     if (V2.isUndef())
13003       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13004                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13005
13006     // Use dedicated unpack instructions for masks that match their pattern.
13007     if (SDValue V =
13008             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13009       return V;
13010   }
13011
13012   // Try to use shift instructions.
13013   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13014                                                 Zeroable, Subtarget, DAG))
13015     return Shift;
13016
13017   // If we have VLX support, we can use VALIGN or EXPAND.
13018   if (Subtarget.hasVLX()) {
13019     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13020                                                     Mask, Subtarget, DAG))
13021       return Rotate;
13022
13023     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13024                                                V1, V2, DAG, Subtarget))
13025       return V;
13026   }
13027
13028   // Try to use byte rotation instructions.
13029   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13030           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13031     return Rotate;
13032
13033   // Try to create an in-lane repeating shuffle mask and then shuffle the
13034   // results into the target lanes.
13035   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13036           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13037     return V;
13038
13039   // If the shuffle patterns aren't repeated but it is a single input, directly
13040   // generate a cross-lane VPERMD instruction.
13041   if (V2.isUndef()) {
13042     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13043     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13044   }
13045
13046   // Assume that a single SHUFPS is faster than an alternative sequence of
13047   // multiple instructions (even if the CPU has a domain penalty).
13048   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13049   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13050     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13051     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13052     SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13053                                                   CastV1, CastV2, DAG);
13054     return DAG.getBitcast(MVT::v8i32, ShufPS);
13055   }
13056
13057   // Try to simplify this by merging 128-bit lanes to enable a lane-based
13058   // shuffle.
13059   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13060           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13061     return Result;
13062
13063   // Otherwise fall back on generic blend lowering.
13064   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13065                                                     Mask, DAG);
13066 }
13067
13068 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13069 ///
13070 /// This routine is only called when we have AVX2 and thus a reasonable
13071 /// instruction set for v16i16 shuffling..
13072 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13073                                         const APInt &Zeroable,
13074                                         SDValue V1, SDValue V2,
13075                                         const X86Subtarget &Subtarget,
13076                                         SelectionDAG &DAG) {
13077   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13078   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13079   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13080   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13081
13082   // Whenever we can lower this as a zext, that instruction is strictly faster
13083   // than any alternative. It also allows us to fold memory operands into the
13084   // shuffle in many cases.
13085   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13086           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13087     return ZExt;
13088
13089   // Check for being able to broadcast a single element.
13090   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13091                                                         Mask, Subtarget, DAG))
13092     return Broadcast;
13093
13094   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13095                                                 Zeroable, Subtarget, DAG))
13096     return Blend;
13097
13098   // Use dedicated unpack instructions for masks that match their pattern.
13099   if (SDValue V =
13100           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13101     return V;
13102
13103   // Try to use shift instructions.
13104   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13105                                                 Zeroable, Subtarget, DAG))
13106     return Shift;
13107
13108   // Try to use byte rotation instructions.
13109   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13110           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13111     return Rotate;
13112
13113   // Try to create an in-lane repeating shuffle mask and then shuffle the
13114   // the results into the target lanes.
13115   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13116           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13117     return V;
13118
13119   if (V2.isUndef()) {
13120     // There are no generalized cross-lane shuffle operations available on i16
13121     // element types.
13122     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13123       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13124                                                      Mask, DAG);
13125
13126     SmallVector<int, 8> RepeatedMask;
13127     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13128       // As this is a single-input shuffle, the repeated mask should be
13129       // a strictly valid v8i16 mask that we can pass through to the v8i16
13130       // lowering to handle even the v16 case.
13131       return lowerV8I16GeneralSingleInputVectorShuffle(
13132           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13133     }
13134   }
13135
13136   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13137           DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13138     return PSHUFB;
13139
13140   // AVX512BWVL can lower to VPERMW.
13141   if (Subtarget.hasBWI() && Subtarget.hasVLX())
13142     return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13143
13144   // Try to simplify this by merging 128-bit lanes to enable a lane-based
13145   // shuffle.
13146   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13147           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13148     return Result;
13149
13150   // Otherwise fall back on generic lowering.
13151   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13152 }
13153
13154 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13155 ///
13156 /// This routine is only called when we have AVX2 and thus a reasonable
13157 /// instruction set for v32i8 shuffling..
13158 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13159                                        const APInt &Zeroable,
13160                                        SDValue V1, SDValue V2,
13161                                        const X86Subtarget &Subtarget,
13162                                        SelectionDAG &DAG) {
13163   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13164   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13165   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13166   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13167
13168   // Whenever we can lower this as a zext, that instruction is strictly faster
13169   // than any alternative. It also allows us to fold memory operands into the
13170   // shuffle in many cases.
13171   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13172           DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13173     return ZExt;
13174
13175   // Check for being able to broadcast a single element.
13176   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13177                                                         Mask, Subtarget, DAG))
13178     return Broadcast;
13179
13180   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13181                                                 Zeroable, Subtarget, DAG))
13182     return Blend;
13183
13184   // Use dedicated unpack instructions for masks that match their pattern.
13185   if (SDValue V =
13186           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13187     return V;
13188
13189   // Try to use shift instructions.
13190   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13191                                                 Zeroable, Subtarget, DAG))
13192     return Shift;
13193
13194   // Try to use byte rotation instructions.
13195   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13196           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13197     return Rotate;
13198
13199   // Try to create an in-lane repeating shuffle mask and then shuffle the
13200   // the results into the target lanes.
13201   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13202           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13203     return V;
13204
13205   // There are no generalized cross-lane shuffle operations available on i8
13206   // element types.
13207   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13208     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13209                                                    DAG);
13210
13211   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13212           DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13213     return PSHUFB;
13214
13215   // Try to simplify this by merging 128-bit lanes to enable a lane-based
13216   // shuffle.
13217   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13218           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13219     return Result;
13220
13221   // Otherwise fall back on generic lowering.
13222   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13223 }
13224
13225 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13226 ///
13227 /// This routine either breaks down the specific type of a 256-bit x86 vector
13228 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13229 /// together based on the available instructions.
13230 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13231                                         MVT VT, SDValue V1, SDValue V2,
13232                                         const APInt &Zeroable,
13233                                         const X86Subtarget &Subtarget,
13234                                         SelectionDAG &DAG) {
13235   // If we have a single input to the zero element, insert that into V1 if we
13236   // can do so cheaply.
13237   int NumElts = VT.getVectorNumElements();
13238   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13239
13240   if (NumV2Elements == 1 && Mask[0] >= NumElts)
13241     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13242             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13243       return Insertion;
13244
13245   // Handle special cases where the lower or upper half is UNDEF.
13246   if (SDValue V =
13247           lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13248     return V;
13249
13250   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13251   // can check for those subtargets here and avoid much of the subtarget
13252   // querying in the per-vector-type lowering routines. With AVX1 we have
13253   // essentially *zero* ability to manipulate a 256-bit vector with integer
13254   // types. Since we'll use floating point types there eventually, just
13255   // immediately cast everything to a float and operate entirely in that domain.
13256   if (VT.isInteger() && !Subtarget.hasAVX2()) {
13257     int ElementBits = VT.getScalarSizeInBits();
13258     if (ElementBits < 32) {
13259       // No floating point type available, if we can't use the bit operations
13260       // for masking/blending then decompose into 128-bit vectors.
13261       if (SDValue V =
13262               lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13263         return V;
13264       if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13265         return V;
13266       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13267     }
13268
13269     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13270                                 VT.getVectorNumElements());
13271     V1 = DAG.getBitcast(FpVT, V1);
13272     V2 = DAG.getBitcast(FpVT, V2);
13273     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13274   }
13275
13276   switch (VT.SimpleTy) {
13277   case MVT::v4f64:
13278     return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13279   case MVT::v4i64:
13280     return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13281   case MVT::v8f32:
13282     return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13283   case MVT::v8i32:
13284     return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13285   case MVT::v16i16:
13286     return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13287   case MVT::v32i8:
13288     return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13289
13290   default:
13291     llvm_unreachable("Not a valid 256-bit x86 vector type!");
13292   }
13293 }
13294
13295 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13296 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13297                                         ArrayRef<int> Mask, SDValue V1,
13298                                         SDValue V2, SelectionDAG &DAG) {
13299   assert(VT.getScalarSizeInBits() == 64 &&
13300          "Unexpected element type size for 128bit shuffle.");
13301
13302   // To handle 256 bit vector requires VLX and most probably
13303   // function lowerV2X128VectorShuffle() is better solution.
13304   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13305
13306   SmallVector<int, 4> WidenedMask;
13307   if (!canWidenShuffleElements(Mask, WidenedMask))
13308     return SDValue();
13309
13310   // Check for patterns which can be matched with a single insert of a 256-bit
13311   // subvector.
13312   bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13313                                         {0, 1, 2, 3, 0, 1, 2, 3});
13314   if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13315                                         {0, 1, 2, 3, 8, 9, 10, 11})) {
13316     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13317     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13318                               DAG.getIntPtrConstant(0, DL));
13319     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13320                               OnlyUsesV1 ? V1 : V2,
13321                               DAG.getIntPtrConstant(0, DL));
13322     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13323   }
13324
13325   assert(WidenedMask.size() == 4);
13326
13327   // See if this is an insertion of the lower 128-bits of V2 into V1.
13328   bool IsInsert = true;
13329   int V2Index = -1;
13330   for (int i = 0; i < 4; ++i) {
13331     assert(WidenedMask[i] >= -1);
13332     if (WidenedMask[i] < 0)
13333       continue;
13334
13335     // Make sure all V1 subvectors are in place.
13336     if (WidenedMask[i] < 4) {
13337       if (WidenedMask[i] != i) {
13338         IsInsert = false;
13339         break;
13340       }
13341     } else {
13342       // Make sure we only have a single V2 index and its the lowest 128-bits.
13343       if (V2Index >= 0 || WidenedMask[i] != 4) {
13344         IsInsert = false;
13345         break;
13346       }
13347       V2Index = i;
13348     }
13349   }
13350   if (IsInsert && V2Index >= 0) {
13351     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13352     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13353                                  DAG.getIntPtrConstant(0, DL));
13354     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13355   }
13356
13357   // Try to lower to to vshuf64x2/vshuf32x4.
13358   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13359   unsigned PermMask = 0;
13360   // Insure elements came from the same Op.
13361   for (int i = 0; i < 4; ++i) {
13362     assert(WidenedMask[i] >= -1);
13363     if (WidenedMask[i] < 0)
13364       continue;
13365
13366     SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13367     unsigned OpIndex = i / 2;
13368     if (Ops[OpIndex].isUndef())
13369       Ops[OpIndex] = Op;
13370     else if (Ops[OpIndex] != Op)
13371       return SDValue();
13372
13373     // Convert the 128-bit shuffle mask selection values into 128-bit selection
13374     // bits defined by a vshuf64x2 instruction's immediate control byte.
13375     PermMask |= (WidenedMask[i] % 4) << (i * 2);
13376   }
13377
13378   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13379                      DAG.getConstant(PermMask, DL, MVT::i8));
13380 }
13381
13382 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13383 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13384                                        const APInt &Zeroable,
13385                                        SDValue V1, SDValue V2,
13386                                        const X86Subtarget &Subtarget,
13387                                        SelectionDAG &DAG) {
13388   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13389   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13390   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13391
13392   if (V2.isUndef()) {
13393     // Use low duplicate instructions for masks that match their pattern.
13394     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13395       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13396
13397     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13398       // Non-half-crossing single input shuffles can be lowered with an
13399       // interleaved permutation.
13400       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13401                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13402                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13403                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13404       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13405                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13406     }
13407
13408     SmallVector<int, 4> RepeatedMask;
13409     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13410       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13411                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13412   }
13413
13414   if (SDValue Shuf128 =
13415           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13416     return Shuf128;
13417
13418   if (SDValue Unpck =
13419           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13420     return Unpck;
13421
13422   // Check if the blend happens to exactly fit that of SHUFPD.
13423   if (SDValue Op =
13424       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13425     return Op;
13426
13427   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13428                                              V2, DAG, Subtarget))
13429     return V;
13430
13431   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13432                                                 Zeroable, Subtarget, DAG))
13433     return Blend;
13434
13435   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13436 }
13437
13438 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13439 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13440                                         const APInt &Zeroable,
13441                                         SDValue V1, SDValue V2,
13442                                         const X86Subtarget &Subtarget,
13443                                         SelectionDAG &DAG) {
13444   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13445   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13446   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13447
13448   // If the shuffle mask is repeated in each 128-bit lane, we have many more
13449   // options to efficiently lower the shuffle.
13450   SmallVector<int, 4> RepeatedMask;
13451   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13452     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13453
13454     // Use even/odd duplicate instructions for masks that match their pattern.
13455     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13456       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13457     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13458       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13459
13460     if (V2.isUndef())
13461       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13462                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13463
13464     // Use dedicated unpack instructions for masks that match their pattern.
13465     if (SDValue Unpck =
13466             lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13467       return Unpck;
13468
13469     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13470                                                   Zeroable, Subtarget, DAG))
13471       return Blend;
13472
13473     // Otherwise, fall back to a SHUFPS sequence.
13474     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13475   }
13476   // If we have AVX512F support, we can use VEXPAND.
13477   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13478                                              V1, V2, DAG, Subtarget))
13479     return V;
13480
13481   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13482 }
13483
13484 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13485 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13486                                        const APInt &Zeroable,
13487                                        SDValue V1, SDValue V2,
13488                                        const X86Subtarget &Subtarget,
13489                                        SelectionDAG &DAG) {
13490   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13491   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13492   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13493
13494   if (SDValue Shuf128 =
13495           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13496     return Shuf128;
13497
13498   if (V2.isUndef()) {
13499     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13500     // can use lower latency instructions that will operate on all four
13501     // 128-bit lanes.
13502     SmallVector<int, 2> Repeated128Mask;
13503     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13504       SmallVector<int, 4> PSHUFDMask;
13505       scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13506       return DAG.getBitcast(
13507           MVT::v8i64,
13508           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13509                       DAG.getBitcast(MVT::v16i32, V1),
13510                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13511     }
13512
13513     SmallVector<int, 4> Repeated256Mask;
13514     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13515       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13516                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13517   }
13518
13519   // Try to use shift instructions.
13520   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13521                                                 Zeroable, Subtarget, DAG))
13522     return Shift;
13523
13524   // Try to use VALIGN.
13525   if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13526                                                   Mask, Subtarget, DAG))
13527     return Rotate;
13528
13529   // Try to use PALIGNR.
13530   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13531                                                       Mask, Subtarget, DAG))
13532     return Rotate;
13533
13534   if (SDValue Unpck =
13535           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13536     return Unpck;
13537   // If we have AVX512F support, we can use VEXPAND.
13538   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13539                                              V2, DAG, Subtarget))
13540     return V;
13541
13542   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13543                                                 Zeroable, Subtarget, DAG))
13544     return Blend;
13545
13546   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13547 }
13548
13549 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13550 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13551                                         const APInt &Zeroable,
13552                                         SDValue V1, SDValue V2,
13553                                         const X86Subtarget &Subtarget,
13554                                         SelectionDAG &DAG) {
13555   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13556   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13557   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13558
13559   // Whenever we can lower this as a zext, that instruction is strictly faster
13560   // than any alternative. It also allows us to fold memory operands into the
13561   // shuffle in many cases.
13562   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13563           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13564     return ZExt;
13565
13566   // If the shuffle mask is repeated in each 128-bit lane we can use more
13567   // efficient instructions that mirror the shuffles across the four 128-bit
13568   // lanes.
13569   SmallVector<int, 4> RepeatedMask;
13570   bool Is128BitLaneRepeatedShuffle =
13571       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13572   if (Is128BitLaneRepeatedShuffle) {
13573     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13574     if (V2.isUndef())
13575       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13576                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13577
13578     // Use dedicated unpack instructions for masks that match their pattern.
13579     if (SDValue V =
13580             lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13581       return V;
13582   }
13583
13584   // Try to use shift instructions.
13585   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13586                                                 Zeroable, Subtarget, DAG))
13587     return Shift;
13588
13589   // Try to use VALIGN.
13590   if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13591                                                   Mask, Subtarget, DAG))
13592     return Rotate;
13593
13594   // Try to use byte rotation instructions.
13595   if (Subtarget.hasBWI())
13596     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13597             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13598       return Rotate;
13599
13600   // Assume that a single SHUFPS is faster than using a permv shuffle.
13601   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13602   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13603     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13604     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13605     SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13606                                                   CastV1, CastV2, DAG);
13607     return DAG.getBitcast(MVT::v16i32, ShufPS);
13608   }
13609   // If we have AVX512F support, we can use VEXPAND.
13610   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13611                                              V1, V2, DAG, Subtarget))
13612     return V;
13613
13614   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13615                                                 Zeroable, Subtarget, DAG))
13616     return Blend;
13617   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13618 }
13619
13620 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13621 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13622                                         const APInt &Zeroable,
13623                                         SDValue V1, SDValue V2,
13624                                         const X86Subtarget &Subtarget,
13625                                         SelectionDAG &DAG) {
13626   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13627   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13628   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13629   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13630
13631   // Whenever we can lower this as a zext, that instruction is strictly faster
13632   // than any alternative. It also allows us to fold memory operands into the
13633   // shuffle in many cases.
13634   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13635           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13636     return ZExt;
13637
13638   // Use dedicated unpack instructions for masks that match their pattern.
13639   if (SDValue V =
13640           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13641     return V;
13642
13643   // Try to use shift instructions.
13644   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13645                                                 Zeroable, Subtarget, DAG))
13646     return Shift;
13647
13648   // Try to use byte rotation instructions.
13649   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13650           DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13651     return Rotate;
13652
13653   if (V2.isUndef()) {
13654     SmallVector<int, 8> RepeatedMask;
13655     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13656       // As this is a single-input shuffle, the repeated mask should be
13657       // a strictly valid v8i16 mask that we can pass through to the v8i16
13658       // lowering to handle even the v32 case.
13659       return lowerV8I16GeneralSingleInputVectorShuffle(
13660           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13661     }
13662   }
13663
13664   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13665                                                 Zeroable, Subtarget, DAG))
13666     return Blend;
13667
13668   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13669 }
13670
13671 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13672 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13673                                        const APInt &Zeroable,
13674                                        SDValue V1, SDValue V2,
13675                                        const X86Subtarget &Subtarget,
13676                                        SelectionDAG &DAG) {
13677   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13678   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13679   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13680   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13681
13682   // Whenever we can lower this as a zext, that instruction is strictly faster
13683   // than any alternative. It also allows us to fold memory operands into the
13684   // shuffle in many cases.
13685   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13686           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13687     return ZExt;
13688
13689   // Use dedicated unpack instructions for masks that match their pattern.
13690   if (SDValue V =
13691           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13692     return V;
13693
13694   // Try to use shift instructions.
13695   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13696                                                 Zeroable, Subtarget, DAG))
13697     return Shift;
13698
13699   // Try to use byte rotation instructions.
13700   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13701           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13702     return Rotate;
13703
13704   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13705           DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13706     return PSHUFB;
13707
13708   // VBMI can use VPERMV/VPERMV3 byte shuffles.
13709   if (Subtarget.hasVBMI())
13710     return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13711
13712   // Try to create an in-lane repeating shuffle mask and then shuffle the
13713   // the results into the target lanes.
13714   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13715           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13716     return V;
13717
13718   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13719                                                 Zeroable, Subtarget, DAG))
13720     return Blend;
13721
13722   // FIXME: Implement direct support for this type!
13723   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13724 }
13725
13726 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13727 ///
13728 /// This routine either breaks down the specific type of a 512-bit x86 vector
13729 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13730 /// together based on the available instructions.
13731 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13732                                         MVT VT, SDValue V1, SDValue V2,
13733                                         const APInt &Zeroable,
13734                                         const X86Subtarget &Subtarget,
13735                                         SelectionDAG &DAG) {
13736   assert(Subtarget.hasAVX512() &&
13737          "Cannot lower 512-bit vectors w/ basic ISA!");
13738
13739   // If we have a single input to the zero element, insert that into V1 if we
13740   // can do so cheaply.
13741   int NumElts = Mask.size();
13742   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13743
13744   if (NumV2Elements == 1 && Mask[0] >= NumElts)
13745     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13746             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13747       return Insertion;
13748
13749   // Check for being able to broadcast a single element.
13750   if (SDValue Broadcast =
13751           lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13752     return Broadcast;
13753
13754   // Dispatch to each element type for lowering. If we don't have support for
13755   // specific element type shuffles at 512 bits, immediately split them and
13756   // lower them. Each lowering routine of a given type is allowed to assume that
13757   // the requisite ISA extensions for that element type are available.
13758   switch (VT.SimpleTy) {
13759   case MVT::v8f64:
13760     return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13761   case MVT::v16f32:
13762     return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13763   case MVT::v8i64:
13764     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13765   case MVT::v16i32:
13766     return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13767   case MVT::v32i16:
13768     return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13769   case MVT::v64i8:
13770     return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13771
13772   default:
13773     llvm_unreachable("Not a valid 512-bit x86 vector type!");
13774   }
13775 }
13776
13777 // Lower vXi1 vector shuffles.
13778 // There is no a dedicated instruction on AVX-512 that shuffles the masks.
13779 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
13780 // vector, shuffle and then truncate it back.
13781 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13782                                       MVT VT, SDValue V1, SDValue V2,
13783                                       const X86Subtarget &Subtarget,
13784                                       SelectionDAG &DAG) {
13785   assert(Subtarget.hasAVX512() &&
13786          "Cannot lower 512-bit vectors w/o basic ISA!");
13787   MVT ExtVT;
13788   switch (VT.SimpleTy) {
13789   default:
13790     llvm_unreachable("Expected a vector of i1 elements");
13791   case MVT::v2i1:
13792     ExtVT = MVT::v2i64;
13793     break;
13794   case MVT::v4i1:
13795     ExtVT = MVT::v4i32;
13796     break;
13797   case MVT::v8i1:
13798     ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13799     break;
13800   case MVT::v16i1:
13801     ExtVT = MVT::v16i32;
13802     break;
13803   case MVT::v32i1:
13804     ExtVT = MVT::v32i16;
13805     break;
13806   case MVT::v64i1:
13807     ExtVT = MVT::v64i8;
13808     break;
13809   }
13810
13811   if (ISD::isBuildVectorAllZeros(V1.getNode()))
13812     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13813   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13814     V1 = getOnesVector(ExtVT, DAG, DL);
13815   else
13816     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13817
13818   if (V2.isUndef())
13819     V2 = DAG.getUNDEF(ExtVT);
13820   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13821     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13822   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13823     V2 = getOnesVector(ExtVT, DAG, DL);
13824   else
13825     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13826
13827   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13828   // i1 was sign extended we can use X86ISD::CVT2MASK.
13829   int NumElems = VT.getVectorNumElements();
13830   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13831       (Subtarget.hasDQI() && (NumElems < 32)))
13832     return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13833
13834   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13835 }
13836
13837 /// Helper function that returns true if the shuffle mask should be
13838 /// commuted to improve canonicalization.
13839 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13840   int NumElements = Mask.size();
13841
13842   int NumV1Elements = 0, NumV2Elements = 0;
13843   for (int M : Mask)
13844     if (M < 0)
13845       continue;
13846     else if (M < NumElements)
13847       ++NumV1Elements;
13848     else
13849       ++NumV2Elements;
13850
13851   // Commute the shuffle as needed such that more elements come from V1 than
13852   // V2. This allows us to match the shuffle pattern strictly on how many
13853   // elements come from V1 without handling the symmetric cases.
13854   if (NumV2Elements > NumV1Elements)
13855     return true;
13856
13857   assert(NumV1Elements > 0 && "No V1 indices");
13858
13859   if (NumV2Elements == 0)
13860     return false;
13861
13862   // When the number of V1 and V2 elements are the same, try to minimize the
13863   // number of uses of V2 in the low half of the vector. When that is tied,
13864   // ensure that the sum of indices for V1 is equal to or lower than the sum
13865   // indices for V2. When those are equal, try to ensure that the number of odd
13866   // indices for V1 is lower than the number of odd indices for V2.
13867   if (NumV1Elements == NumV2Elements) {
13868     int LowV1Elements = 0, LowV2Elements = 0;
13869     for (int M : Mask.slice(0, NumElements / 2))
13870       if (M >= NumElements)
13871         ++LowV2Elements;
13872       else if (M >= 0)
13873         ++LowV1Elements;
13874     if (LowV2Elements > LowV1Elements)
13875       return true;
13876     if (LowV2Elements == LowV1Elements) {
13877       int SumV1Indices = 0, SumV2Indices = 0;
13878       for (int i = 0, Size = Mask.size(); i < Size; ++i)
13879         if (Mask[i] >= NumElements)
13880           SumV2Indices += i;
13881         else if (Mask[i] >= 0)
13882           SumV1Indices += i;
13883       if (SumV2Indices < SumV1Indices)
13884         return true;
13885       if (SumV2Indices == SumV1Indices) {
13886         int NumV1OddIndices = 0, NumV2OddIndices = 0;
13887         for (int i = 0, Size = Mask.size(); i < Size; ++i)
13888           if (Mask[i] >= NumElements)
13889             NumV2OddIndices += i % 2;
13890           else if (Mask[i] >= 0)
13891             NumV1OddIndices += i % 2;
13892         if (NumV2OddIndices < NumV1OddIndices)
13893           return true;
13894       }
13895     }
13896   }
13897
13898   return false;
13899 }
13900
13901 /// \brief Top-level lowering for x86 vector shuffles.
13902 ///
13903 /// This handles decomposition, canonicalization, and lowering of all x86
13904 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13905 /// above in helper routines. The canonicalization attempts to widen shuffles
13906 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13907 /// s.t. only one of the two inputs needs to be tested, etc.
13908 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13909                                   SelectionDAG &DAG) {
13910   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13911   ArrayRef<int> Mask = SVOp->getMask();
13912   SDValue V1 = Op.getOperand(0);
13913   SDValue V2 = Op.getOperand(1);
13914   MVT VT = Op.getSimpleValueType();
13915   int NumElements = VT.getVectorNumElements();
13916   SDLoc DL(Op);
13917   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13918
13919   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13920          "Can't lower MMX shuffles");
13921
13922   bool V1IsUndef = V1.isUndef();
13923   bool V2IsUndef = V2.isUndef();
13924   if (V1IsUndef && V2IsUndef)
13925     return DAG.getUNDEF(VT);
13926
13927   // When we create a shuffle node we put the UNDEF node to second operand,
13928   // but in some cases the first operand may be transformed to UNDEF.
13929   // In this case we should just commute the node.
13930   if (V1IsUndef)
13931     return DAG.getCommutedVectorShuffle(*SVOp);
13932
13933   // Check for non-undef masks pointing at an undef vector and make the masks
13934   // undef as well. This makes it easier to match the shuffle based solely on
13935   // the mask.
13936   if (V2IsUndef)
13937     for (int M : Mask)
13938       if (M >= NumElements) {
13939         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13940         for (int &M : NewMask)
13941           if (M >= NumElements)
13942             M = -1;
13943         return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13944       }
13945
13946   // Check for illegal shuffle mask element index values.
13947   int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13948   assert(llvm::all_of(Mask,
13949                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13950          "Out of bounds shuffle index");
13951
13952   // We actually see shuffles that are entirely re-arrangements of a set of
13953   // zero inputs. This mostly happens while decomposing complex shuffles into
13954   // simple ones. Directly lower these as a buildvector of zeros.
13955   APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13956   if (Zeroable.isAllOnesValue())
13957     return getZeroVector(VT, Subtarget, DAG, DL);
13958
13959   // Try to collapse shuffles into using a vector type with fewer elements but
13960   // wider element types. We cap this to not form integers or floating point
13961   // elements wider than 64 bits, but it might be interesting to form i128
13962   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13963   SmallVector<int, 16> WidenedMask;
13964   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13965       canWidenShuffleElements(Mask, WidenedMask)) {
13966     MVT NewEltVT = VT.isFloatingPoint()
13967                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13968                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13969     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13970     // Make sure that the new vector type is legal. For example, v2f64 isn't
13971     // legal on SSE1.
13972     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13973       V1 = DAG.getBitcast(NewVT, V1);
13974       V2 = DAG.getBitcast(NewVT, V2);
13975       return DAG.getBitcast(
13976           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13977     }
13978   }
13979
13980   // Commute the shuffle if it will improve canonicalization.
13981   if (canonicalizeShuffleMaskWithCommute(Mask))
13982     return DAG.getCommutedVectorShuffle(*SVOp);
13983
13984   // For each vector width, delegate to a specialized lowering routine.
13985   if (VT.is128BitVector())
13986     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13987                                     DAG);
13988
13989   if (VT.is256BitVector())
13990     return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13991                                     DAG);
13992
13993   if (VT.is512BitVector())
13994     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13995                                     DAG);
13996
13997   if (Is1BitVector)
13998     return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13999
14000   llvm_unreachable("Unimplemented!");
14001 }
14002
14003 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14004 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14005                                            const X86Subtarget &Subtarget,
14006                                            SelectionDAG &DAG) {
14007   SDValue Cond = Op.getOperand(0);
14008   SDValue LHS = Op.getOperand(1);
14009   SDValue RHS = Op.getOperand(2);
14010   SDLoc dl(Op);
14011   MVT VT = Op.getSimpleValueType();
14012
14013   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14014     return SDValue();
14015   auto *CondBV = cast<BuildVectorSDNode>(Cond);
14016
14017   // Only non-legal VSELECTs reach this lowering, convert those into generic
14018   // shuffles and re-use the shuffle lowering path for blends.
14019   SmallVector<int, 32> Mask;
14020   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14021     SDValue CondElt = CondBV->getOperand(i);
14022     Mask.push_back(
14023         isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14024                                      : -1);
14025   }
14026   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14027 }
14028
14029 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14030   // A vselect where all conditions and data are constants can be optimized into
14031   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14032   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14033       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14034       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14035     return SDValue();
14036
14037   // If this VSELECT has a vector if i1 as a mask, it will be directly matched
14038   // with patterns on the mask registers on AVX-512.
14039   if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14040     return Op;
14041
14042   // Try to lower this to a blend-style vector shuffle. This can handle all
14043   // constant condition cases.
14044   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14045     return BlendOp;
14046
14047   // Variable blends are only legal from SSE4.1 onward.
14048   if (!Subtarget.hasSSE41())
14049     return SDValue();
14050
14051   SDLoc dl(Op);
14052   MVT VT = Op.getSimpleValueType();
14053
14054   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14055   // into an i1 condition so that we can use the mask-based 512-bit blend
14056   // instructions.
14057   if (VT.getSizeInBits() == 512) {
14058     SDValue Cond = Op.getOperand(0);
14059     // The vNi1 condition case should be handled above as it can be trivially
14060     // lowered.
14061     assert(Cond.getValueType().getScalarSizeInBits() ==
14062                VT.getScalarSizeInBits() &&
14063            "Should have a size-matched integer condition!");
14064     // Build a mask by testing the condition against itself (tests for zero).
14065     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14066     SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14067     // Now return a new VSELECT using the mask.
14068     return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14069   }
14070
14071   // Only some types will be legal on some subtargets. If we can emit a legal
14072   // VSELECT-matching blend, return Op, and but if we need to expand, return
14073   // a null value.
14074   switch (VT.SimpleTy) {
14075   default:
14076     // Most of the vector types have blends past SSE4.1.
14077     return Op;
14078
14079   case MVT::v32i8:
14080     // The byte blends for AVX vectors were introduced only in AVX2.
14081     if (Subtarget.hasAVX2())
14082       return Op;
14083
14084     return SDValue();
14085
14086   case MVT::v8i16:
14087   case MVT::v16i16:
14088     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
14089     if (Subtarget.hasBWI() && Subtarget.hasVLX())
14090       return Op;
14091
14092     // FIXME: We should custom lower this by fixing the condition and using i8
14093     // blends.
14094     return SDValue();
14095   }
14096 }
14097
14098 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14099   MVT VT = Op.getSimpleValueType();
14100   SDLoc dl(Op);
14101
14102   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14103     return SDValue();
14104
14105   if (VT.getSizeInBits() == 8) {
14106     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14107                                   Op.getOperand(0), Op.getOperand(1));
14108     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14109                                   DAG.getValueType(VT));
14110     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14111   }
14112
14113   if (VT == MVT::f32) {
14114     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14115     // the result back to FR32 register. It's only worth matching if the
14116     // result has a single use which is a store or a bitcast to i32.  And in
14117     // the case of a store, it's not worth it if the index is a constant 0,
14118     // because a MOVSSmr can be used instead, which is smaller and faster.
14119     if (!Op.hasOneUse())
14120       return SDValue();
14121     SDNode *User = *Op.getNode()->use_begin();
14122     if ((User->getOpcode() != ISD::STORE ||
14123          isNullConstant(Op.getOperand(1))) &&
14124         (User->getOpcode() != ISD::BITCAST ||
14125          User->getValueType(0) != MVT::i32))
14126       return SDValue();
14127     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14128                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14129                                   Op.getOperand(1));
14130     return DAG.getBitcast(MVT::f32, Extract);
14131   }
14132
14133   if (VT == MVT::i32 || VT == MVT::i64) {
14134     // ExtractPS/pextrq works with constant index.
14135     if (isa<ConstantSDNode>(Op.getOperand(1)))
14136       return Op;
14137   }
14138
14139   return SDValue();
14140 }
14141
14142 /// Extract one bit from mask vector, like v16i1 or v8i1.
14143 /// AVX-512 feature.
14144 SDValue
14145 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
14146   SDValue Vec = Op.getOperand(0);
14147   SDLoc dl(Vec);
14148   MVT VecVT = Vec.getSimpleValueType();
14149   SDValue Idx = Op.getOperand(1);
14150   MVT EltVT = Op.getSimpleValueType();
14151
14152   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14153          "Unexpected vector type in ExtractBitFromMaskVector");
14154
14155   // variable index can't be handled in mask registers,
14156   // extend vector to VR512/128
14157   if (!isa<ConstantSDNode>(Idx)) {
14158     unsigned NumElts = VecVT.getVectorNumElements();
14159     // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14160     // than extending to 128/256bit.
14161     unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14162     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14163     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14164     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14165                               ExtVT.getVectorElementType(), Ext, Idx);
14166     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14167   }
14168
14169   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14170   if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14171       (VecVT.getVectorNumElements() < 8)) {
14172     // Use kshiftlw/rw instruction.
14173     VecVT = MVT::v16i1;
14174     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14175                       DAG.getUNDEF(VecVT),
14176                       Vec,
14177                       DAG.getIntPtrConstant(0, dl));
14178   }
14179   unsigned MaxSift = VecVT.getVectorNumElements() - 1;
14180   if (MaxSift - IdxVal)
14181     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14182                       DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14183   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14184                     DAG.getConstant(MaxSift, dl, MVT::i8));
14185   return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14186                      DAG.getIntPtrConstant(0, dl));
14187 }
14188
14189 SDValue
14190 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14191                                            SelectionDAG &DAG) const {
14192   SDLoc dl(Op);
14193   SDValue Vec = Op.getOperand(0);
14194   MVT VecVT = Vec.getSimpleValueType();
14195   SDValue Idx = Op.getOperand(1);
14196
14197   if (VecVT.getVectorElementType() == MVT::i1)
14198     return ExtractBitFromMaskVector(Op, DAG);
14199
14200   if (!isa<ConstantSDNode>(Idx)) {
14201     // Its more profitable to go through memory (1 cycles throughput)
14202     // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
14203     // IACA tool was used to get performance estimation
14204     // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14205     //
14206     // example : extractelement <16 x i8> %a, i32 %i
14207     //
14208     // Block Throughput: 3.00 Cycles
14209     // Throughput Bottleneck: Port5
14210     //
14211     // | Num Of |   Ports pressure in cycles  |    |
14212     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
14213     // ---------------------------------------------
14214     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
14215     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
14216     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
14217     // Total Num Of Uops: 4
14218     //
14219     //
14220     // Block Throughput: 1.00 Cycles
14221     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14222     //
14223     // |    |  Ports pressure in cycles   |  |
14224     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
14225     // ---------------------------------------------------------
14226     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14227     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
14228     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
14229     // Total Num Of Uops: 4
14230
14231     return SDValue();
14232   }
14233
14234   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14235
14236   // If this is a 256-bit vector result, first extract the 128-bit vector and
14237   // then extract the element from the 128-bit vector.
14238   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14239     // Get the 128-bit vector.
14240     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14241     MVT EltVT = VecVT.getVectorElementType();
14242
14243     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14244     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14245
14246     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14247     // this can be done with a mask.
14248     IdxVal &= ElemsPerChunk - 1;
14249     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14250                        DAG.getConstant(IdxVal, dl, MVT::i32));
14251   }
14252
14253   assert(VecVT.is128BitVector() && "Unexpected vector length");
14254
14255   MVT VT = Op.getSimpleValueType();
14256
14257   if (VT.getSizeInBits() == 16) {
14258     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14259     // we're going to zero extend the register or fold the store (SSE41 only).
14260     if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14261         !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14262       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14263                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14264                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
14265
14266     // Transform it so it match pextrw which produces a 32-bit result.
14267     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14268                                   Op.getOperand(0), Op.getOperand(1));
14269     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14270                                   DAG.getValueType(VT));
14271     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14272   }
14273
14274   if (Subtarget.hasSSE41())
14275     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14276       return Res;
14277
14278   // TODO: We only extract a single element from v16i8, we can probably afford
14279   // to be more aggressive here before using the default approach of spilling to
14280   // stack.
14281   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14282     // Extract either the lowest i32 or any i16, and extract the sub-byte.
14283     int DWordIdx = IdxVal / 4;
14284     if (DWordIdx == 0) {
14285       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14286                                 DAG.getBitcast(MVT::v4i32, Vec),
14287                                 DAG.getIntPtrConstant(DWordIdx, dl));
14288       int ShiftVal = (IdxVal % 4) * 8;
14289       if (ShiftVal != 0)
14290         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14291                           DAG.getConstant(ShiftVal, dl, MVT::i32));
14292       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14293     }
14294
14295     int WordIdx = IdxVal / 2;
14296     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14297                               DAG.getBitcast(MVT::v8i16, Vec),
14298                               DAG.getIntPtrConstant(WordIdx, dl));
14299     int ShiftVal = (IdxVal % 2) * 8;
14300     if (ShiftVal != 0)
14301       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14302                         DAG.getConstant(ShiftVal, dl, MVT::i16));
14303     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14304   }
14305
14306   if (VT.getSizeInBits() == 32) {
14307     if (IdxVal == 0)
14308       return Op;
14309
14310     // SHUFPS the element to the lowest double word, then movss.
14311     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14312     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14313     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14314                        DAG.getIntPtrConstant(0, dl));
14315   }
14316
14317   if (VT.getSizeInBits() == 64) {
14318     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14319     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14320     //        to match extract_elt for f64.
14321     if (IdxVal == 0)
14322       return Op;
14323
14324     // UNPCKHPD the element to the lowest double word, then movsd.
14325     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14326     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14327     int Mask[2] = { 1, -1 };
14328     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14329     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14330                        DAG.getIntPtrConstant(0, dl));
14331   }
14332
14333   return SDValue();
14334 }
14335
14336 /// Insert one bit to mask vector, like v16i1 or v8i1.
14337 /// AVX-512 feature.
14338 SDValue
14339 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14340   SDLoc dl(Op);
14341   SDValue Vec = Op.getOperand(0);
14342   SDValue Elt = Op.getOperand(1);
14343   SDValue Idx = Op.getOperand(2);
14344   MVT VecVT = Vec.getSimpleValueType();
14345
14346   if (!isa<ConstantSDNode>(Idx)) {
14347     // Non constant index. Extend source and destination,
14348     // insert element and then truncate the result.
14349     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
14350     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
14351     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14352       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14353       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14354     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14355   }
14356
14357   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14358   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14359   unsigned NumElems = VecVT.getVectorNumElements();
14360
14361   if(Vec.isUndef()) {
14362     if (IdxVal)
14363       EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14364                              DAG.getConstant(IdxVal, dl, MVT::i8));
14365     return EltInVec;
14366   }
14367
14368   // Insertion of one bit into first position
14369   if (IdxVal == 0 ) {
14370     // Clean top bits of vector.
14371     EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14372                            DAG.getConstant(NumElems - 1, dl, MVT::i8));
14373     EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14374                            DAG.getConstant(NumElems - 1, dl, MVT::i8));
14375     // Clean the first bit in source vector.
14376     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14377                       DAG.getConstant(1 , dl, MVT::i8));
14378     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14379                       DAG.getConstant(1, dl, MVT::i8));
14380
14381     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14382   }
14383   // Insertion of one bit into last position
14384   if (IdxVal == NumElems -1) {
14385     // Move the bit to the last position inside the vector.
14386     EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14387                            DAG.getConstant(IdxVal, dl, MVT::i8));
14388     // Clean the last bit in the source vector.
14389     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14390                            DAG.getConstant(1, dl, MVT::i8));
14391     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14392                            DAG.getConstant(1 , dl, MVT::i8));
14393
14394     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14395   }
14396
14397   // Use shuffle to insert element.
14398   SmallVector<int, 64> MaskVec(NumElems);
14399   for (unsigned i = 0; i != NumElems; ++i)
14400     MaskVec[i] = (i == IdxVal) ? NumElems : i;
14401
14402   return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14403 }
14404
14405 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14406                                                   SelectionDAG &DAG) const {
14407   MVT VT = Op.getSimpleValueType();
14408   MVT EltVT = VT.getVectorElementType();
14409   unsigned NumElts = VT.getVectorNumElements();
14410
14411   if (EltVT == MVT::i1)
14412     return InsertBitToMaskVector(Op, DAG);
14413
14414   SDLoc dl(Op);
14415   SDValue N0 = Op.getOperand(0);
14416   SDValue N1 = Op.getOperand(1);
14417   SDValue N2 = Op.getOperand(2);
14418   if (!isa<ConstantSDNode>(N2))
14419     return SDValue();
14420   auto *N2C = cast<ConstantSDNode>(N2);
14421   unsigned IdxVal = N2C->getZExtValue();
14422
14423   bool IsZeroElt = X86::isZeroNode(N1);
14424   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14425
14426   // If we are inserting a element, see if we can do this more efficiently with
14427   // a blend shuffle with a rematerializable vector than a costly integer
14428   // insertion.
14429   if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14430       16 <= EltVT.getSizeInBits()) {
14431     SmallVector<int, 8> BlendMask;
14432     for (unsigned i = 0; i != NumElts; ++i)
14433       BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14434     SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14435                                   : DAG.getConstant(-1, dl, VT);
14436     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14437   }
14438
14439   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14440   // into that, and then insert the subvector back into the result.
14441   if (VT.is256BitVector() || VT.is512BitVector()) {
14442     // With a 256-bit vector, we can insert into the zero element efficiently
14443     // using a blend if we have AVX or AVX2 and the right data type.
14444     if (VT.is256BitVector() && IdxVal == 0) {
14445       // TODO: It is worthwhile to cast integer to floating point and back
14446       // and incur a domain crossing penalty if that's what we'll end up
14447       // doing anyway after extracting to a 128-bit vector.
14448       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14449           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14450         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14451         N2 = DAG.getIntPtrConstant(1, dl);
14452         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14453       }
14454     }
14455
14456     // Get the desired 128-bit vector chunk.
14457     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14458
14459     // Insert the element into the desired chunk.
14460     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14461     assert(isPowerOf2_32(NumEltsIn128));
14462     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14463     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14464
14465     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14466                     DAG.getConstant(IdxIn128, dl, MVT::i32));
14467
14468     // Insert the changed part back into the bigger vector
14469     return insert128BitVector(N0, V, IdxVal, DAG, dl);
14470   }
14471   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14472
14473   // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14474   // argument. SSE41 required for pinsrb.
14475   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14476     unsigned Opc;
14477     if (VT == MVT::v8i16) {
14478       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14479       Opc = X86ISD::PINSRW;
14480     } else {
14481       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14482       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14483       Opc = X86ISD::PINSRB;
14484     }
14485
14486     if (N1.getValueType() != MVT::i32)
14487       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14488     if (N2.getValueType() != MVT::i32)
14489       N2 = DAG.getIntPtrConstant(IdxVal, dl);
14490     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14491   }
14492
14493   if (Subtarget.hasSSE41()) {
14494     if (EltVT == MVT::f32) {
14495       // Bits [7:6] of the constant are the source select. This will always be
14496       //   zero here. The DAG Combiner may combine an extract_elt index into
14497       //   these bits. For example (insert (extract, 3), 2) could be matched by
14498       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14499       // Bits [5:4] of the constant are the destination select. This is the
14500       //   value of the incoming immediate.
14501       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14502       //   combine either bitwise AND or insert of float 0.0 to set these bits.
14503
14504       bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14505       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14506         // If this is an insertion of 32-bits into the low 32-bits of
14507         // a vector, we prefer to generate a blend with immediate rather
14508         // than an insertps. Blends are simpler operations in hardware and so
14509         // will always have equal or better performance than insertps.
14510         // But if optimizing for size and there's a load folding opportunity,
14511         // generate insertps because blendps does not have a 32-bit memory
14512         // operand form.
14513         N2 = DAG.getIntPtrConstant(1, dl);
14514         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14515         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14516       }
14517       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14518       // Create this as a scalar to vector..
14519       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14520       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14521     }
14522
14523     // PINSR* works with constant index.
14524     if (EltVT == MVT::i32 || EltVT == MVT::i64)
14525       return Op;
14526   }
14527
14528   return SDValue();
14529 }
14530
14531 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14532                                      SelectionDAG &DAG) {
14533   SDLoc dl(Op);
14534   MVT OpVT = Op.getSimpleValueType();
14535
14536   // It's always cheaper to replace a xor+movd with xorps and simplifies further
14537   // combines.
14538   if (X86::isZeroNode(Op.getOperand(0)))
14539     return getZeroVector(OpVT, Subtarget, DAG, dl);
14540
14541   // If this is a 256-bit vector result, first insert into a 128-bit
14542   // vector and then insert into the 256-bit vector.
14543   if (!OpVT.is128BitVector()) {
14544     // Insert into a 128-bit vector.
14545     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14546     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14547                                  OpVT.getVectorNumElements() / SizeFactor);
14548
14549     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14550
14551     // Insert the 128-bit vector.
14552     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14553   }
14554   assert(OpVT.is128BitVector() && "Expected an SSE type!");
14555
14556   // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14557   if (OpVT == MVT::v4i32)
14558     return Op;
14559
14560   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14561   return DAG.getBitcast(
14562       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14563 }
14564
14565 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
14566 // a simple subregister reference or explicit instructions to grab
14567 // upper bits of a vector.
14568 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14569                                       SelectionDAG &DAG) {
14570   assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14571
14572   SDLoc dl(Op);
14573   SDValue In =  Op.getOperand(0);
14574   SDValue Idx = Op.getOperand(1);
14575   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14576   MVT ResVT = Op.getSimpleValueType();
14577
14578   // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond
14579   // would result with: v1i1 = extract_subvector(vXi1, idx).
14580   // Lower these into extract_vector_elt which is already selectable.
14581   if (ResVT == MVT::v1i1) {
14582     assert(Subtarget.hasAVX512() &&
14583            "Boolean EXTRACT_SUBVECTOR requires AVX512");
14584
14585     MVT EltVT = ResVT.getVectorElementType();
14586     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14587     MVT LegalVT =
14588         (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
14589     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
14590     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
14591   }
14592
14593   assert((In.getSimpleValueType().is256BitVector() ||
14594           In.getSimpleValueType().is512BitVector()) &&
14595          "Can only extract from 256-bit or 512-bit vectors");
14596
14597   // If the input is a buildvector just emit a smaller one.
14598   unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14599   if (In.getOpcode() == ISD::BUILD_VECTOR)
14600     return DAG.getBuildVector(
14601         ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
14602
14603   // Everything else is legal.
14604   return Op;
14605 }
14606
// Lower a node with an INSERT_SUBVECTOR opcode. Only results whose vector
// element type is i1 are expected here (see the assert below); the lowering
// itself is delegated to insert1BitVector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  // The result vector's element type must be i1.
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}
14616
14617 // Returns the appropriate wrapper opcode for a global reference.
14618 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14619   // References to absolute symbols are never PC-relative.
14620   if (GV && GV->isAbsoluteSymbolRef())
14621     return X86ISD::Wrapper;
14622
14623   CodeModel::Model M = getTargetMachine().getCodeModel();
14624   if (Subtarget.isPICStyleRIPRel() &&
14625       (M == CodeModel::Small || M == CodeModel::Kernel))
14626     return X86ISD::WrapperRIP;
14627
14628   return X86ISD::Wrapper;
14629 }
14630
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterparts wrapped in an X86ISD::Wrapper node. Suppose N is
// one of the above-mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing modes. These wrapped nodes will be selected
// into MOV32ri.
14637 SDValue
14638 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14639   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14640
14641   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14642   // global base reg.
14643   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14644
14645   auto PtrVT = getPointerTy(DAG.getDataLayout());
14646   SDValue Result = DAG.getTargetConstantPool(
14647       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14648   SDLoc DL(CP);
14649   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14650   // With PIC, the address is actually $g + Offset.
14651   if (OpFlag) {
14652     Result =
14653         DAG.getNode(ISD::ADD, DL, PtrVT,
14654                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14655   }
14656
14657   return Result;
14658 }
14659
14660 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14661   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14662
14663   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14664   // global base reg.
14665   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14666
14667   auto PtrVT = getPointerTy(DAG.getDataLayout());
14668   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14669   SDLoc DL(JT);
14670   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14671
14672   // With PIC, the address is actually $g + Offset.
14673   if (OpFlag)
14674     Result =
14675         DAG.getNode(ISD::ADD, DL, PtrVT,
14676                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14677
14678   return Result;
14679 }
14680
14681 SDValue
14682 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14683   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14684
14685   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14686   // global base reg.
14687   const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14688   unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14689
14690   auto PtrVT = getPointerTy(DAG.getDataLayout());
14691   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14692
14693   SDLoc DL(Op);
14694   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14695
14696   // With PIC, the address is actually $g + Offset.
14697   if (isPositionIndependent() && !Subtarget.is64Bit()) {
14698     Result =
14699         DAG.getNode(ISD::ADD, DL, PtrVT,
14700                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14701   }
14702
14703   // For symbols that require a load from a stub to get the address, emit the
14704   // load.
14705   if (isGlobalStubReference(OpFlag))
14706     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14707                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14708
14709   return Result;
14710 }
14711
14712 SDValue
14713 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14714   // Create the TargetBlockAddressAddress node.
14715   unsigned char OpFlags =
14716     Subtarget.classifyBlockAddressReference();
14717   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14718   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14719   SDLoc dl(Op);
14720   auto PtrVT = getPointerTy(DAG.getDataLayout());
14721   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14722   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14723
14724   // With PIC, the address is actually $g + Offset.
14725   if (isGlobalRelativeToPICBase(OpFlags)) {
14726     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14727                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14728   }
14729
14730   return Result;
14731 }
14732
14733 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14734                                               const SDLoc &dl, int64_t Offset,
14735                                               SelectionDAG &DAG) const {
14736   // Create the TargetGlobalAddress node, folding in the constant
14737   // offset if it is legal.
14738   unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14739   CodeModel::Model M = DAG.getTarget().getCodeModel();
14740   auto PtrVT = getPointerTy(DAG.getDataLayout());
14741   SDValue Result;
14742   if (OpFlags == X86II::MO_NO_FLAG &&
14743       X86::isOffsetSuitableForCodeModel(Offset, M)) {
14744     // A direct static reference to a global.
14745     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14746     Offset = 0;
14747   } else {
14748     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14749   }
14750
14751   Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14752
14753   // With PIC, the address is actually $g + Offset.
14754   if (isGlobalRelativeToPICBase(OpFlags)) {
14755     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14756                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14757   }
14758
14759   // For globals that require a load from a stub to get the address, emit the
14760   // load.
14761   if (isGlobalStubReference(OpFlags))
14762     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14763                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14764
14765   // If there was a non-zero offset that we didn't fold, create an explicit
14766   // addition for it.
14767   if (Offset != 0)
14768     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14769                          DAG.getConstant(Offset, dl, PtrVT));
14770
14771   return Result;
14772 }
14773
14774 SDValue
14775 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14776   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14777   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14778   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
14779 }
14780
14781 static SDValue
14782 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14783            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14784            unsigned char OperandFlags, bool LocalDynamic = false) {
14785   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14786   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14787   SDLoc dl(GA);
14788   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14789                                            GA->getValueType(0),
14790                                            GA->getOffset(),
14791                                            OperandFlags);
14792
14793   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14794                                            : X86ISD::TLSADDR;
14795
14796   if (InFlag) {
14797     SDValue Ops[] = { Chain,  TGA, *InFlag };
14798     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14799   } else {
14800     SDValue Ops[]  = { Chain, TGA };
14801     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14802   }
14803
14804   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14805   MFI.setAdjustsStack(true);
14806   MFI.setHasCalls(true);
14807
14808   SDValue Flag = Chain.getValue(1);
14809   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14810 }
14811
14812 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14813 static SDValue
14814 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14815                                 const EVT PtrVT) {
14816   SDValue InFlag;
14817   SDLoc dl(GA);  // ? function entry point might be better
14818   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14819                                    DAG.getNode(X86ISD::GlobalBaseReg,
14820                                                SDLoc(), PtrVT), InFlag);
14821   InFlag = Chain.getValue(1);
14822
14823   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14824 }
14825
14826 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14827 static SDValue
14828 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14829                                 const EVT PtrVT) {
14830   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14831                     X86::RAX, X86II::MO_TLSGD);
14832 }
14833
14834 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14835                                            SelectionDAG &DAG,
14836                                            const EVT PtrVT,
14837                                            bool is64Bit) {
14838   SDLoc dl(GA);
14839
14840   // Get the start address of the TLS block for this module.
14841   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14842       .getInfo<X86MachineFunctionInfo>();
14843   MFI->incNumLocalDynamicTLSAccesses();
14844
14845   SDValue Base;
14846   if (is64Bit) {
14847     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14848                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
14849   } else {
14850     SDValue InFlag;
14851     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14852         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14853     InFlag = Chain.getValue(1);
14854     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14855                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14856   }
14857
14858   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
14859   // of Base.
14860
14861   // Build x@dtpoff.
14862   unsigned char OperandFlags = X86II::MO_DTPOFF;
14863   unsigned WrapperKind = X86ISD::Wrapper;
14864   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14865                                            GA->getValueType(0),
14866                                            GA->getOffset(), OperandFlags);
14867   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14868
14869   // Add x@dtpoff with the base.
14870   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14871 }
14872
14873 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14874 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14875                                    const EVT PtrVT, TLSModel::Model model,
14876                                    bool is64Bit, bool isPIC) {
14877   SDLoc dl(GA);
14878
14879   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14880   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14881                                                          is64Bit ? 257 : 256));
14882
14883   SDValue ThreadPointer =
14884       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14885                   MachinePointerInfo(Ptr));
14886
14887   unsigned char OperandFlags = 0;
14888   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
14889   // initialexec.
14890   unsigned WrapperKind = X86ISD::Wrapper;
14891   if (model == TLSModel::LocalExec) {
14892     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14893   } else if (model == TLSModel::InitialExec) {
14894     if (is64Bit) {
14895       OperandFlags = X86II::MO_GOTTPOFF;
14896       WrapperKind = X86ISD::WrapperRIP;
14897     } else {
14898       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14899     }
14900   } else {
14901     llvm_unreachable("Unexpected model");
14902   }
14903
14904   // emit "addl x@ntpoff,%eax" (local exec)
14905   // or "addl x@indntpoff,%eax" (initial exec)
14906   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14907   SDValue TGA =
14908       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14909                                  GA->getOffset(), OperandFlags);
14910   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14911
14912   if (model == TLSModel::InitialExec) {
14913     if (isPIC && !is64Bit) {
14914       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14915                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14916                            Offset);
14917     }
14918
14919     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14920                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14921   }
14922
14923   // The address of the thread local variable is the add of the thread
14924   // pointer with the offset of the variable.
14925   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14926 }
14927
14928 SDValue
14929 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14930
14931   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14932
14933   if (DAG.getTarget().Options.EmulatedTLS)
14934     return LowerToTLSEmulatedModel(GA, DAG);
14935
14936   const GlobalValue *GV = GA->getGlobal();
14937   auto PtrVT = getPointerTy(DAG.getDataLayout());
14938   bool PositionIndependent = isPositionIndependent();
14939
14940   if (Subtarget.isTargetELF()) {
14941     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14942     switch (model) {
14943       case TLSModel::GeneralDynamic:
14944         if (Subtarget.is64Bit())
14945           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14946         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14947       case TLSModel::LocalDynamic:
14948         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14949                                            Subtarget.is64Bit());
14950       case TLSModel::InitialExec:
14951       case TLSModel::LocalExec:
14952         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14953                                    PositionIndependent);
14954     }
14955     llvm_unreachable("Unknown TLS model.");
14956   }
14957
14958   if (Subtarget.isTargetDarwin()) {
14959     // Darwin only has one model of TLS.  Lower to that.
14960     unsigned char OpFlag = 0;
14961     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14962                            X86ISD::WrapperRIP : X86ISD::Wrapper;
14963
14964     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14965     // global base reg.
14966     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14967     if (PIC32)
14968       OpFlag = X86II::MO_TLVP_PIC_BASE;
14969     else
14970       OpFlag = X86II::MO_TLVP;
14971     SDLoc DL(Op);
14972     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14973                                                 GA->getValueType(0),
14974                                                 GA->getOffset(), OpFlag);
14975     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14976
14977     // With PIC32, the address is actually $g + Offset.
14978     if (PIC32)
14979       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14980                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14981                            Offset);
14982
14983     // Lowering the machine isd will make sure everything is in the right
14984     // location.
14985     SDValue Chain = DAG.getEntryNode();
14986     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14987     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
14988     SDValue Args[] = { Chain, Offset };
14989     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14990     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14991                                DAG.getIntPtrConstant(0, DL, true),
14992                                Chain.getValue(1), DL);
14993
14994     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14995     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14996     MFI.setAdjustsStack(true);
14997
14998     // And our return value (tls address) is in the standard call return value
14999     // location.
15000     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15001     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15002   }
15003
15004   if (Subtarget.isTargetKnownWindowsMSVC() ||
15005       Subtarget.isTargetWindowsItanium() ||
15006       Subtarget.isTargetWindowsGNU()) {
15007     // Just use the implicit TLS architecture
15008     // Need to generate something similar to:
15009     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15010     //                                  ; from TEB
15011     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
15012     //   mov     rcx, qword [rdx+rcx*8]
15013     //   mov     eax, .tls$:tlsvar
15014     //   [rax+rcx] contains the address
15015     // Windows 64bit: gs:0x58
15016     // Windows 32bit: fs:__tls_array
15017
15018     SDLoc dl(GA);
15019     SDValue Chain = DAG.getEntryNode();
15020
15021     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15022     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15023     // use its literal value of 0x2C.
15024     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15025                                         ? Type::getInt8PtrTy(*DAG.getContext(),
15026                                                              256)
15027                                         : Type::getInt32PtrTy(*DAG.getContext(),
15028                                                               257));
15029
15030     SDValue TlsArray = Subtarget.is64Bit()
15031                            ? DAG.getIntPtrConstant(0x58, dl)
15032                            : (Subtarget.isTargetWindowsGNU()
15033                                   ? DAG.getIntPtrConstant(0x2C, dl)
15034                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
15035
15036     SDValue ThreadPointer =
15037         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15038
15039     SDValue res;
15040     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15041       res = ThreadPointer;
15042     } else {
15043       // Load the _tls_index variable
15044       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15045       if (Subtarget.is64Bit())
15046         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15047                              MachinePointerInfo(), MVT::i32);
15048       else
15049         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15050
15051       auto &DL = DAG.getDataLayout();
15052       SDValue Scale =
15053           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15054       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15055
15056       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15057     }
15058
15059     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15060
15061     // Get the offset of start of .tls section
15062     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15063                                              GA->getValueType(0),
15064                                              GA->getOffset(), X86II::MO_SECREL);
15065     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15066
15067     // The address of the thread local variable is the add of the thread
15068     // pointer with the offset of the variable.
15069     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15070   }
15071
15072   llvm_unreachable("TLS not implemented for this target.");
15073 }
15074
15075 /// Lower SRA_PARTS and friends, which return two i32 values
15076 /// and take a 2 x i32 value to shift plus a shift amount.
15077 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15078   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15079   MVT VT = Op.getSimpleValueType();
15080   unsigned VTBits = VT.getSizeInBits();
15081   SDLoc dl(Op);
15082   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15083   SDValue ShOpLo = Op.getOperand(0);
15084   SDValue ShOpHi = Op.getOperand(1);
15085   SDValue ShAmt  = Op.getOperand(2);
15086   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15087   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
15088   // during isel.
15089   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15090                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
15091   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15092                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
15093                        : DAG.getConstant(0, dl, VT);
15094
15095   SDValue Tmp2, Tmp3;
15096   if (Op.getOpcode() == ISD::SHL_PARTS) {
15097     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15098     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15099   } else {
15100     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15101     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15102   }
15103
15104   // If the shift amount is larger or equal than the width of a part we can't
15105   // rely on the results of shld/shrd. Insert a test and select the appropriate
15106   // values for large shift amounts.
15107   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15108                                 DAG.getConstant(VTBits, dl, MVT::i8));
15109   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15110                              AndNode, DAG.getConstant(0, dl, MVT::i8));
15111
15112   SDValue Hi, Lo;
15113   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15114   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15115   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15116
15117   if (Op.getOpcode() == ISD::SHL_PARTS) {
15118     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15119     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15120   } else {
15121     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15122     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15123   }
15124
15125   SDValue Ops[2] = { Lo, Hi };
15126   return DAG.getMergeValues(Ops, dl);
15127 }
15128
15129 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15130                                            SelectionDAG &DAG) const {
15131   SDValue Src = Op.getOperand(0);
15132   MVT SrcVT = Src.getSimpleValueType();
15133   MVT VT = Op.getSimpleValueType();
15134   SDLoc dl(Op);
15135
15136   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15137   if (SrcVT.isVector()) {
15138     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15139       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15140                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15141                                      DAG.getUNDEF(SrcVT)));
15142     }
15143     if (SrcVT.getVectorElementType() == MVT::i1) {
15144       if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15145         return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15146                            DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15147       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15148       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15149                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
15150     }
15151     return SDValue();
15152   }
15153
15154   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15155          "Unknown SINT_TO_FP to lower!");
15156
15157   // These are really Legal; return the operand so the caller accepts it as
15158   // Legal.
15159   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15160     return Op;
15161   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15162       Subtarget.is64Bit()) {
15163     return Op;
15164   }
15165
15166   SDValue ValueToStore = Op.getOperand(0);
15167   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15168       !Subtarget.is64Bit())
15169     // Bitcasting to f64 here allows us to do a single 64-bit store from
15170     // an SSE register, avoiding the store forwarding penalty that would come
15171     // with two 32-bit stores.
15172     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15173
15174   unsigned Size = SrcVT.getSizeInBits()/8;
15175   MachineFunction &MF = DAG.getMachineFunction();
15176   auto PtrVT = getPointerTy(MF.getDataLayout());
15177   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15178   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15179   SDValue Chain = DAG.getStore(
15180       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15181       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15182   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15183 }
15184
15185 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15186                                      SDValue StackSlot,
15187                                      SelectionDAG &DAG) const {
15188   // Build the FILD
15189   SDLoc DL(Op);
15190   SDVTList Tys;
15191   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15192   if (useSSE)
15193     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15194   else
15195     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15196
15197   unsigned ByteSize = SrcVT.getSizeInBits()/8;
15198
15199   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15200   MachineMemOperand *MMO;
15201   if (FI) {
15202     int SSFI = FI->getIndex();
15203     MMO = DAG.getMachineFunction().getMachineMemOperand(
15204         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15205         MachineMemOperand::MOLoad, ByteSize, ByteSize);
15206   } else {
15207     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15208     StackSlot = StackSlot.getOperand(1);
15209   }
15210   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15211   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15212                                            X86ISD::FILD, DL,
15213                                            Tys, Ops, SrcVT, MMO);
15214
15215   if (useSSE) {
15216     Chain = Result.getValue(1);
15217     SDValue InFlag = Result.getValue(2);
15218
15219     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15220     // shouldn't be necessary except that RFP cannot be live across
15221     // multiple blocks. When stackifier is fixed, they can be uncoupled.
15222     MachineFunction &MF = DAG.getMachineFunction();
15223     unsigned SSFISize = Op.getValueSizeInBits()/8;
15224     int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15225     auto PtrVT = getPointerTy(MF.getDataLayout());
15226     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15227     Tys = DAG.getVTList(MVT::Other);
15228     SDValue Ops[] = {
15229       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15230     };
15231     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15232         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15233         MachineMemOperand::MOStore, SSFISize, SSFISize);
15234
15235     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15236                                     Ops, Op.getValueType(), MMO);
15237     Result = DAG.getLoad(
15238         Op.getValueType(), DL, Chain, StackSlot,
15239         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15240   }
15241
15242   return Result;
15243 }
15244
15245 /// 64-bit unsigned integer to double expansion.
15246 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15247                                                SelectionDAG &DAG) const {
15248   // This algorithm is not obvious. Here it is what we're trying to output:
15249   /*
15250      movq       %rax,  %xmm0
15251      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15252      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15253      #ifdef __SSE3__
15254        haddpd   %xmm0, %xmm0
15255      #else
15256        pshufd   $0x4e, %xmm0, %xmm1
15257        addpd    %xmm1, %xmm0
15258      #endif
15259   */
15260
15261   SDLoc dl(Op);
15262   LLVMContext *Context = DAG.getContext();
15263
15264   // Build some magic constants.
15265   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15266   Constant *C0 = ConstantDataVector::get(*Context, CV0);
15267   auto PtrVT = getPointerTy(DAG.getDataLayout());
15268   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15269
15270   SmallVector<Constant*,2> CV1;
15271   CV1.push_back(
15272     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15273                                       APInt(64, 0x4330000000000000ULL))));
15274   CV1.push_back(
15275     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15276                                       APInt(64, 0x4530000000000000ULL))));
15277   Constant *C1 = ConstantVector::get(CV1);
15278   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15279
15280   // Load the 64-bit value into an XMM register.
15281   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15282                             Op.getOperand(0));
15283   SDValue CLod0 =
15284       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15285                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15286                   /* Alignment = */ 16);
15287   SDValue Unpck1 =
15288       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15289
15290   SDValue CLod1 =
15291       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15292                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15293                   /* Alignment = */ 16);
15294   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15295   // TODO: Are there any fast-math-flags to propagate here?
15296   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15297   SDValue Result;
15298
15299   if (Subtarget.hasSSE3()) {
15300     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15301     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15302   } else {
15303     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15304     SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15305     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15306                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15307   }
15308
15309   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15310                      DAG.getIntPtrConstant(0, dl));
15311 }
15312
15313 /// 32-bit unsigned integer to float expansion.
15314 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15315                                                SelectionDAG &DAG) const {
15316   SDLoc dl(Op);
15317   // FP constant to bias correct the final result.
15318   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15319                                    MVT::f64);
15320
15321   // Load the 32-bit value into an XMM register.
15322   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15323                              Op.getOperand(0));
15324
15325   // Zero out the upper parts of the register.
15326   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15327
15328   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15329                      DAG.getBitcast(MVT::v2f64, Load),
15330                      DAG.getIntPtrConstant(0, dl));
15331
15332   // Or the load with the bias.
15333   SDValue Or = DAG.getNode(
15334       ISD::OR, dl, MVT::v2i64,
15335       DAG.getBitcast(MVT::v2i64,
15336                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15337       DAG.getBitcast(MVT::v2i64,
15338                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15339   Or =
15340       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15341                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15342
15343   // Subtract the bias.
15344   // TODO: Are there any fast-math-flags to propagate here?
15345   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15346
15347   // Handle final rounding.
15348   MVT DestVT = Op.getSimpleValueType();
15349
15350   if (DestVT.bitsLT(MVT::f64))
15351     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15352                        DAG.getIntPtrConstant(0, dl));
15353   if (DestVT.bitsGT(MVT::f64))
15354     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15355
15356   // Handle final rounding.
15357   return Sub;
15358 }
15359
15360 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15361                                      const X86Subtarget &Subtarget, SDLoc &DL) {
15362   if (Op.getSimpleValueType() != MVT::v2f64)
15363     return SDValue();
15364
15365   SDValue N0 = Op.getOperand(0);
15366   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15367
15368   // Legalize to v4i32 type.
15369   N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15370                    DAG.getUNDEF(MVT::v2i32));
15371
15372   if (Subtarget.hasAVX512())
15373     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15374
15375   // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15376   // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15377   SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15378   SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15379
15380   // Two to the power of half-word-size.
15381   SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15382
15383   // Clear upper part of LO, lower HI.
15384   SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15385   SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15386
15387   SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15388           fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15389   SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15390
15391   // Add the two halves.
15392   return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15393 }
15394
15395 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15396                                      const X86Subtarget &Subtarget) {
15397   // The algorithm is the following:
15398   // #ifdef __SSE4_1__
15399   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15400   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15401   //                                 (uint4) 0x53000000, 0xaa);
15402   // #else
15403   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15404   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
15405   // #endif
15406   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15407   //     return (float4) lo + fhi;
15408
15409   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15410   // reassociate the two FADDs, and if we do that, the algorithm fails
15411   // spectacularly (PR24512).
15412   // FIXME: If we ever have some kind of Machine FMF, this should be marked
15413   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15414   // there's also the MachineCombiner reassociations happening on Machine IR.
15415   if (DAG.getTarget().Options.UnsafeFPMath)
15416     return SDValue();
15417
15418   SDLoc DL(Op);
15419   SDValue V = Op->getOperand(0);
15420   MVT VecIntVT = V.getSimpleValueType();
15421   bool Is128 = VecIntVT == MVT::v4i32;
15422   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15423   // If we convert to something else than the supported type, e.g., to v4f64,
15424   // abort early.
15425   if (VecFloatVT != Op->getSimpleValueType(0))
15426     return SDValue();
15427
15428   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15429          "Unsupported custom type");
15430
15431   // In the #idef/#else code, we have in common:
15432   // - The vector of constants:
15433   // -- 0x4b000000
15434   // -- 0x53000000
15435   // - A shift:
15436   // -- v >> 16
15437
15438   // Create the splat vector for 0x4b000000.
15439   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15440   // Create the splat vector for 0x53000000.
15441   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15442
15443   // Create the right shift.
15444   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15445   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15446
15447   SDValue Low, High;
15448   if (Subtarget.hasSSE41()) {
15449     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15450     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15451     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15452     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15453     // Low will be bitcasted right away, so do not bother bitcasting back to its
15454     // original type.
15455     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15456                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15457     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15458     //                                 (uint4) 0x53000000, 0xaa);
15459     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15460     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15461     // High will be bitcasted right away, so do not bother bitcasting back to
15462     // its original type.
15463     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15464                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15465   } else {
15466     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15467     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15468     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15469     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15470
15471     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
15472     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15473   }
15474
15475   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15476   SDValue VecCstFAdd = DAG.getConstantFP(
15477       APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15478
15479   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15480   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15481   // TODO: Are there any fast-math-flags to propagate here?
15482   SDValue FHigh =
15483       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15484   //     return (float4) lo + fhi;
15485   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15486   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
15487 }
15488
15489 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15490                                                SelectionDAG &DAG) const {
15491   SDValue N0 = Op.getOperand(0);
15492   MVT SrcVT = N0.getSimpleValueType();
15493   SDLoc dl(Op);
15494
15495   if (SrcVT.getVectorElementType() == MVT::i1) {
15496     if (SrcVT == MVT::v2i1)
15497       return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15498                          DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15499     MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15500     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15501                        DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15502   }
15503
15504   switch (SrcVT.SimpleTy) {
15505   default:
15506     llvm_unreachable("Custom UINT_TO_FP is not supported!");
15507   case MVT::v4i8:
15508   case MVT::v4i16:
15509   case MVT::v8i8:
15510   case MVT::v8i16: {
15511     MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15512     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15513                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
15514   }
15515   case MVT::v2i32:
15516     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15517   case MVT::v4i32:
15518   case MVT::v8i32:
15519     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15520   case MVT::v16i8:
15521   case MVT::v16i16:
15522     assert(Subtarget.hasAVX512());
15523     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15524                        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
15525   }
15526 }
15527
15528 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15529                                            SelectionDAG &DAG) const {
15530   SDValue N0 = Op.getOperand(0);
15531   SDLoc dl(Op);
15532   auto PtrVT = getPointerTy(DAG.getDataLayout());
15533
15534   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15535   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15536   // the optimization here.
15537   if (DAG.SignBitIsZero(N0))
15538     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15539
15540   if (Op.getSimpleValueType().isVector())
15541     return lowerUINT_TO_FP_vec(Op, DAG);
15542
15543   MVT SrcVT = N0.getSimpleValueType();
15544   MVT DstVT = Op.getSimpleValueType();
15545
15546   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15547       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15548     // Conversions from unsigned i32 to f32/f64 are legal,
15549     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
15550     return Op;
15551   }
15552
15553   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15554     return LowerUINT_TO_FP_i64(Op, DAG);
15555   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15556     return LowerUINT_TO_FP_i32(Op, DAG);
15557   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15558     return SDValue();
15559
15560   // Make a 64-bit buffer, and use it to build an FILD.
15561   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15562   if (SrcVT == MVT::i32) {
15563     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15564     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15565                                   StackSlot, MachinePointerInfo());
15566     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15567                                   OffsetSlot, MachinePointerInfo());
15568     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15569     return Fild;
15570   }
15571
15572   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15573   SDValue ValueToStore = Op.getOperand(0);
15574   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15575     // Bitcasting to f64 here allows us to do a single 64-bit store from
15576     // an SSE register, avoiding the store forwarding penalty that would come
15577     // with two 32-bit stores.
15578     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15579   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15580                                MachinePointerInfo());
15581   // For i64 source, we need to add the appropriate power of 2 if the input
15582   // was negative.  This is the same as the optimization in
15583   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
15584   // we must be careful to do the computation in x87 extended precision, not
15585   // in SSE. (The generic code can't know it's OK to do this, or how to.)
15586   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15587   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15588       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15589       MachineMemOperand::MOLoad, 8, 8);
15590
15591   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15592   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15593   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15594                                          MVT::i64, MMO);
15595
15596   APInt FF(32, 0x5F800000ULL);
15597
15598   // Check whether the sign bit is set.
15599   SDValue SignSet = DAG.getSetCC(
15600       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15601       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15602
15603   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15604   SDValue FudgePtr = DAG.getConstantPool(
15605       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15606
15607   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15608   SDValue Zero = DAG.getIntPtrConstant(0, dl);
15609   SDValue Four = DAG.getIntPtrConstant(4, dl);
15610   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15611   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15612
15613   // Load the value out, extending it from f32 to f80.
15614   // FIXME: Avoid the extend by constructing the right constant pool?
15615   SDValue Fudge = DAG.getExtLoad(
15616       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15617       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15618       /* Alignment = */ 4);
15619   // Extend everything to 80 bits to force it to be done on x87.
15620   // TODO: Are there any fast-math-flags to propagate here?
15621   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15622   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15623                      DAG.getIntPtrConstant(0, dl));
15624 }
15625
15626 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15627 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15628 // just return an <SDValue(), SDValue()> pair.
15629 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15630 // to i16, i32 or i64, and we lower it to a legal sequence.
15631 // If lowered to the final integer result we return a <result, SDValue()> pair.
15632 // Otherwise we lower it to a sequence ending with a FIST, return a
15633 // <FIST, StackSlot> pair, and the caller is responsible for loading
15634 // the final integer result from StackSlot.
15635 std::pair<SDValue,SDValue>
15636 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15637                                    bool IsSigned, bool IsReplace) const {
15638   SDLoc DL(Op);
15639
15640   EVT DstTy = Op.getValueType();
15641   EVT TheVT = Op.getOperand(0).getValueType();
15642   auto PtrVT = getPointerTy(DAG.getDataLayout());
15643
15644   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15645     // f16 must be promoted before using the lowering in this routine.
15646     // fp128 does not use this lowering.
15647     return std::make_pair(SDValue(), SDValue());
15648   }
15649
15650   // If using FIST to compute an unsigned i64, we'll need some fixup
15651   // to handle values above the maximum signed i64.  A FIST is always
15652   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15653   bool UnsignedFixup = !IsSigned &&
15654                        DstTy == MVT::i64 &&
15655                        (!Subtarget.is64Bit() ||
15656                         !isScalarFPTypeInSSEReg(TheVT));
15657
15658   if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15659     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15660     // The low 32 bits of the fist result will have the correct uint32 result.
15661     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15662     DstTy = MVT::i64;
15663   }
15664
15665   assert(DstTy.getSimpleVT() <= MVT::i64 &&
15666          DstTy.getSimpleVT() >= MVT::i16 &&
15667          "Unknown FP_TO_INT to lower!");
15668
15669   // These are really Legal.
15670   if (DstTy == MVT::i32 &&
15671       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15672     return std::make_pair(SDValue(), SDValue());
15673   if (Subtarget.is64Bit() &&
15674       DstTy == MVT::i64 &&
15675       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15676     return std::make_pair(SDValue(), SDValue());
15677
15678   // We lower FP->int64 into FISTP64 followed by a load from a temporary
15679   // stack slot.
15680   MachineFunction &MF = DAG.getMachineFunction();
15681   unsigned MemSize = DstTy.getSizeInBits()/8;
15682   int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15683   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15684
15685   unsigned Opc;
15686   switch (DstTy.getSimpleVT().SimpleTy) {
15687   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15688   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15689   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15690   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15691   }
15692
15693   SDValue Chain = DAG.getEntryNode();
15694   SDValue Value = Op.getOperand(0);
15695   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15696
15697   if (UnsignedFixup) {
15698     //
15699     // Conversion to unsigned i64 is implemented with a select,
15700     // depending on whether the source value fits in the range
15701     // of a signed i64.  Let Thresh be the FP equivalent of
15702     // 0x8000000000000000ULL.
15703     //
15704     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15705     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
15706     //  Fist-to-mem64 FistSrc
15707     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15708     //  to XOR'ing the high 32 bits with Adjust.
15709     //
15710     // Being a power of 2, Thresh is exactly representable in all FP formats.
15711     // For X87 we'd like to use the smallest FP type for this constant, but
15712     // for DAG type consistency we have to match the FP operand type.
15713
15714     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15715     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15716     bool LosesInfo = false;
15717     if (TheVT == MVT::f64)
15718       // The rounding mode is irrelevant as the conversion should be exact.
15719       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15720                               &LosesInfo);
15721     else if (TheVT == MVT::f80)
15722       Status = Thresh.convert(APFloat::x87DoubleExtended(),
15723                               APFloat::rmNearestTiesToEven, &LosesInfo);
15724
15725     assert(Status == APFloat::opOK && !LosesInfo &&
15726            "FP conversion should have been exact");
15727
15728     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15729
15730     SDValue Cmp = DAG.getSetCC(DL,
15731                                getSetCCResultType(DAG.getDataLayout(),
15732                                                   *DAG.getContext(), TheVT),
15733                                Value, ThreshVal, ISD::SETLT);
15734     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15735                            DAG.getConstant(0, DL, MVT::i32),
15736                            DAG.getConstant(0x80000000, DL, MVT::i32));
15737     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15738     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15739                                               *DAG.getContext(), TheVT),
15740                        Value, ThreshVal, ISD::SETLT);
15741     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15742   }
15743
15744   // FIXME This causes a redundant load/store if the SSE-class value is already
15745   // in memory, such as if it is on the callstack.
15746   if (isScalarFPTypeInSSEReg(TheVT)) {
15747     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15748     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15749                          MachinePointerInfo::getFixedStack(MF, SSFI));
15750     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15751     SDValue Ops[] = {
15752       Chain, StackSlot, DAG.getValueType(TheVT)
15753     };
15754
15755     MachineMemOperand *MMO =
15756         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15757                                 MachineMemOperand::MOLoad, MemSize, MemSize);
15758     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15759     Chain = Value.getValue(1);
15760     SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15761     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15762   }
15763
15764   MachineMemOperand *MMO =
15765       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15766                               MachineMemOperand::MOStore, MemSize, MemSize);
15767
15768   if (UnsignedFixup) {
15769
15770     // Insert the FIST, load its result as two i32's,
15771     // and XOR the high i32 with Adjust.
15772
15773     SDValue FistOps[] = { Chain, Value, StackSlot };
15774     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15775                                            FistOps, DstTy, MMO);
15776
15777     SDValue Low32 =
15778         DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15779     SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15780
15781     SDValue High32 =
15782         DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15783     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15784
15785     if (Subtarget.is64Bit()) {
15786       // Join High32 and Low32 into a 64-bit result.
15787       // (High32 << 32) | Low32
15788       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15789       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15790       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15791                            DAG.getConstant(32, DL, MVT::i8));
15792       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15793       return std::make_pair(Result, SDValue());
15794     }
15795
15796     SDValue ResultOps[] = { Low32, High32 };
15797
15798     SDValue pair = IsReplace
15799       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15800       : DAG.getMergeValues(ResultOps, DL);
15801     return std::make_pair(pair, SDValue());
15802   } else {
15803     // Build the FP_TO_INT*_IN_MEM
15804     SDValue Ops[] = { Chain, Value, StackSlot };
15805     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15806                                            Ops, DstTy, MMO);
15807     return std::make_pair(FIST, StackSlot);
15808   }
15809 }
15810
15811 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15812                               const X86Subtarget &Subtarget) {
15813   MVT VT = Op->getSimpleValueType(0);
15814   SDValue In = Op->getOperand(0);
15815   MVT InVT = In.getSimpleValueType();
15816   SDLoc dl(Op);
15817
15818   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15819     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15820
15821   // Optimize vectors in AVX mode:
15822   //
15823   //   v8i16 -> v8i32
15824   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15825   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15826   //   Concat upper and lower parts.
15827   //
15828   //   v4i32 -> v4i64
15829   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
15830   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
15831   //   Concat upper and lower parts.
15832   //
15833
15834   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15835       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15836       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15837     return SDValue();
15838
15839   if (Subtarget.hasInt256())
15840     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15841
15842   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15843   SDValue Undef = DAG.getUNDEF(InVT);
15844   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15845   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15846   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15847
15848   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15849                              VT.getVectorNumElements()/2);
15850
15851   OpLo = DAG.getBitcast(HVT, OpLo);
15852   OpHi = DAG.getBitcast(HVT, OpHi);
15853
15854   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15855 }
15856
15857 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15858                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15859   MVT VT = Op->getSimpleValueType(0);
15860   SDValue In = Op->getOperand(0);
15861   MVT InVT = In.getSimpleValueType();
15862   SDLoc DL(Op);
15863   unsigned NumElts = VT.getVectorNumElements();
15864
15865   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15866       (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15867     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15868
15869   if (InVT.getVectorElementType() != MVT::i1)
15870     return SDValue();
15871
15872   // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
15873   MVT ExtVT = VT;
15874   if (!VT.is512BitVector() && !Subtarget.hasVLX())
15875     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15876
15877   SDValue One =
15878    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15879   SDValue Zero =
15880    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15881
15882   SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
15883   if (VT == ExtVT)
15884     return SelectedVal;
15885   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15886 }
15887
15888 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15889                                SelectionDAG &DAG) {
15890   if (Subtarget.hasFp256())
15891     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15892       return Res;
15893
15894   return SDValue();
15895 }
15896
15897 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15898                                 SelectionDAG &DAG) {
15899   SDLoc DL(Op);
15900   MVT VT = Op.getSimpleValueType();
15901   SDValue In = Op.getOperand(0);
15902   MVT SVT = In.getSimpleValueType();
15903
15904   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15905     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15906
15907   if (Subtarget.hasFp256())
15908     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15909       return Res;
15910
15911   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15912          VT.getVectorNumElements() != SVT.getVectorNumElements());
15913   return SDValue();
15914 }
15915
15916 /// Helper to recursively truncate vector elements in half with PACKSS.
15917 /// It makes use of the fact that vector comparison results will be all-zeros
15918 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15919 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15920 /// within each 128-bit lane.
15921 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15922                                                const SDLoc &DL,
15923                                                SelectionDAG &DAG,
15924                                                const X86Subtarget &Subtarget) {
15925   // Requires SSE2 but AVX512 has fast truncate.
15926   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15927     return SDValue();
15928
15929   EVT SrcVT = In.getValueType();
15930
15931   // No truncation required, we might get here due to recursive calls.
15932   if (SrcVT == DstVT)
15933     return In;
15934
15935   // We only support vector truncation to 128bits or greater from a
15936   // 256bits or greater source.
15937   if ((DstVT.getSizeInBits() % 128) != 0)
15938     return SDValue();
15939   if ((SrcVT.getSizeInBits() % 256) != 0)
15940     return SDValue();
15941
15942   unsigned NumElems = SrcVT.getVectorNumElements();
15943   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15944   assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15945
15946   EVT PackedSVT =
15947       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15948
15949   // Extract lower/upper subvectors.
15950   unsigned NumSubElts = NumElems / 2;
15951   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15952   SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15953   SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15954
15955   // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15956   if (SrcVT.is256BitVector()) {
15957     Lo = DAG.getBitcast(MVT::v8i16, Lo);
15958     Hi = DAG.getBitcast(MVT::v8i16, Hi);
15959     SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15960     return DAG.getBitcast(DstVT, Res);
15961   }
15962
15963   // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15964   // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15965   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15966     Lo = DAG.getBitcast(MVT::v16i16, Lo);
15967     Hi = DAG.getBitcast(MVT::v16i16, Hi);
15968     SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15969
15970     // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15971     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15972     Res = DAG.getBitcast(MVT::v4i64, Res);
15973     Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15974
15975     if (DstVT.is256BitVector())
15976       return DAG.getBitcast(DstVT, Res);
15977
15978     // If 512bit -> 128bit truncate another stage.
15979     EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15980     Res = DAG.getBitcast(PackedVT, Res);
15981     return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15982   }
15983
15984   // Recursively pack lower/upper subvectors, concat result and pack again.
15985   assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15986   EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15987   Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15988   Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15989
15990   PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15991   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15992   return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15993 }
15994
15995 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15996                                   const X86Subtarget &Subtarget) {
15997
15998   SDLoc DL(Op);
15999   MVT VT = Op.getSimpleValueType();
16000   SDValue In = Op.getOperand(0);
16001   MVT InVT = In.getSimpleValueType();
16002
16003   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16004
16005   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16006   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16007   if (InVT.getScalarSizeInBits() <= 16) {
16008     if (Subtarget.hasBWI()) {
16009       // legal, will go to VPMOVB2M, VPMOVW2M
16010       // Shift packed bytes not supported natively, bitcast to word
16011       MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16012       SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
16013                                        DAG.getBitcast(ExtVT, In),
16014                                        DAG.getConstant(ShiftInx, DL, ExtVT));
16015       ShiftNode = DAG.getBitcast(InVT, ShiftNode);
16016       return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
16017     }
16018     // Use TESTD/Q, extended vector to packed dword/qword.
16019     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16020            "Unexpected vector type.");
16021     unsigned NumElts = InVT.getVectorNumElements();
16022     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
16023     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
16024     InVT = ExtVT;
16025     ShiftInx = InVT.getScalarSizeInBits() - 1;
16026   }
16027
16028   SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
16029                                    DAG.getConstant(ShiftInx, DL, InVT));
16030   return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
16031 }
16032
16033 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
16034   SDLoc DL(Op);
16035   MVT VT = Op.getSimpleValueType();
16036   SDValue In = Op.getOperand(0);
16037   MVT InVT = In.getSimpleValueType();
16038
16039   if (VT == MVT::i1) {
16040     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
16041            "Invalid scalar TRUNCATE operation");
16042     if (InVT.getSizeInBits() >= 32)
16043       return SDValue();
16044     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
16045     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
16046   }
16047   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16048          "Invalid TRUNCATE operation");
16049
16050   if (VT.getVectorElementType() == MVT::i1)
16051     return LowerTruncateVecI1(Op, DAG, Subtarget);
16052
16053   // vpmovqb/w/d, vpmovdb/w, vpmovwb
16054   if (Subtarget.hasAVX512()) {
16055     // word to byte only under BWI
16056     if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
16057       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
16058                          getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
16059     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
16060   }
16061
16062   // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
16063   if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
16064     if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
16065       return V;
16066
16067   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16068     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16069     if (Subtarget.hasInt256()) {
16070       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16071       In = DAG.getBitcast(MVT::v8i32, In);
16072       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16073       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16074                          DAG.getIntPtrConstant(0, DL));
16075     }
16076
16077     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16078                                DAG.getIntPtrConstant(0, DL));
16079     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16080                                DAG.getIntPtrConstant(2, DL));
16081     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16082     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16083     static const int ShufMask[] = {0, 2, 4, 6};
16084     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16085   }
16086
16087   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16088     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16089     if (Subtarget.hasInt256()) {
16090       In = DAG.getBitcast(MVT::v32i8, In);
16091
16092       // The PSHUFB mask:
16093       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
16094                                       -1, -1, -1, -1, -1, -1, -1, -1,
16095                                       16, 17, 20, 21, 24, 25, 28, 29,
16096                                       -1, -1, -1, -1, -1, -1, -1, -1 };
16097       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16098       In = DAG.getBitcast(MVT::v4i64, In);
16099
16100       static const int ShufMask2[] = {0,  2,  -1,  -1};
16101       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, In, ShufMask2);
16102       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16103                        DAG.getIntPtrConstant(0, DL));
16104       return DAG.getBitcast(VT, In);
16105     }
16106
16107     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16108                                DAG.getIntPtrConstant(0, DL));
16109
16110     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16111                                DAG.getIntPtrConstant(4, DL));
16112
16113     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16114     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16115
16116     // The PSHUFB mask:
16117     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
16118                                    -1, -1, -1, -1, -1, -1, -1, -1};
16119
16120     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16121     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16122
16123     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16124     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16125
16126     // The MOVLHPS Mask:
16127     static const int ShufMask2[] = {0, 1, 4, 5};
16128     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16129     return DAG.getBitcast(MVT::v8i16, res);
16130   }
16131
16132   // Handle truncation of V256 to V128 using shuffles.
16133   if (!VT.is128BitVector() || !InVT.is256BitVector())
16134     return SDValue();
16135
16136   assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16137
16138   unsigned NumElems = VT.getVectorNumElements();
16139   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16140
16141   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16142   // Prepare truncation shuffle mask
16143   for (unsigned i = 0; i != NumElems; ++i)
16144     MaskVec[i] = i * 2;
16145   In = DAG.getBitcast(NVT, In);
16146   SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16147   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16148                      DAG.getIntPtrConstant(0, DL));
16149 }
16150
16151 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16152   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16153   MVT VT = Op.getSimpleValueType();
16154
16155   if (VT.isVector()) {
16156     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16157     SDValue Src = Op.getOperand(0);
16158     SDLoc dl(Op);
16159     if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16160       return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16161                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16162                                      DAG.getUNDEF(MVT::v2f32)));
16163     }
16164
16165     return SDValue();
16166   }
16167
16168   assert(!VT.isVector());
16169
16170   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16171     IsSigned, /*IsReplace=*/ false);
16172   SDValue FIST = Vals.first, StackSlot = Vals.second;
16173   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16174   if (!FIST.getNode())
16175     return Op;
16176
16177   if (StackSlot.getNode())
16178     // Load the result.
16179     return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16180
16181   // The node is the result.
16182   return FIST;
16183 }
16184
16185 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16186   SDLoc DL(Op);
16187   MVT VT = Op.getSimpleValueType();
16188   SDValue In = Op.getOperand(0);
16189   MVT SVT = In.getSimpleValueType();
16190
16191   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16192
16193   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16194                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16195                                  In, DAG.getUNDEF(SVT)));
16196 }
16197
16198 /// The only differences between FABS and FNEG are the mask and the logic op.
16199 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16200 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16201   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16202          "Wrong opcode for lowering FABS or FNEG.");
16203
16204   bool IsFABS = (Op.getOpcode() == ISD::FABS);
16205
16206   // If this is a FABS and it has an FNEG user, bail out to fold the combination
16207   // into an FNABS. We'll lower the FABS after that if it is still in use.
16208   if (IsFABS)
16209     for (SDNode *User : Op->uses())
16210       if (User->getOpcode() == ISD::FNEG)
16211         return Op;
16212
16213   SDLoc dl(Op);
16214   MVT VT = Op.getSimpleValueType();
16215
16216   bool IsF128 = (VT == MVT::f128);
16217
16218   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16219   // decide if we should generate a 16-byte constant mask when we only need 4 or
16220   // 8 bytes for the scalar case.
16221
16222   MVT LogicVT;
16223   MVT EltVT;
16224
16225   if (VT.isVector()) {
16226     LogicVT = VT;
16227     EltVT = VT.getVectorElementType();
16228   } else if (IsF128) {
16229     // SSE instructions are used for optimized f128 logical operations.
16230     LogicVT = MVT::f128;
16231     EltVT = VT;
16232   } else {
16233     // There are no scalar bitwise logical SSE/AVX instructions, so we
16234     // generate a 16-byte vector constant and logic op even for the scalar case.
16235     // Using a 16-byte mask allows folding the load of the mask with
16236     // the logic op, so it can save (~4 bytes) on code size.
16237     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16238     EltVT = VT;
16239   }
16240
16241   unsigned EltBits = EltVT.getSizeInBits();
16242   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16243   APInt MaskElt =
16244     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16245   const fltSemantics &Sem =
16246       EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16247           (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16248   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16249
16250   SDValue Op0 = Op.getOperand(0);
16251   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16252   unsigned LogicOp =
16253     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16254   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16255
16256   if (VT.isVector() || IsF128)
16257     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16258
16259   // For the scalar case extend to a 128-bit vector, perform the logic op,
16260   // and extract the scalar result back out.
16261   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16262   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16263   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16264                      DAG.getIntPtrConstant(0, dl));
16265 }
16266
16267 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16268   SDValue Mag = Op.getOperand(0);
16269   SDValue Sign = Op.getOperand(1);
16270   SDLoc dl(Op);
16271
16272   // If the sign operand is smaller, extend it first.
16273   MVT VT = Op.getSimpleValueType();
16274   if (Sign.getSimpleValueType().bitsLT(VT))
16275     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16276
16277   // And if it is bigger, shrink it first.
16278   if (Sign.getSimpleValueType().bitsGT(VT))
16279     Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16280
16281   // At this point the operands and the result should have the same
16282   // type, and that won't be f80 since that is not custom lowered.
16283   bool IsF128 = (VT == MVT::f128);
16284   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16285           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16286           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16287          "Unexpected type in LowerFCOPYSIGN");
16288
16289   MVT EltVT = VT.getScalarType();
16290   const fltSemantics &Sem =
16291       EltVT == MVT::f64 ? APFloat::IEEEdouble()
16292                         : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16293
16294   // Perform all scalar logic operations as 16-byte vectors because there are no
16295   // scalar FP logic instructions in SSE.
16296   // TODO: This isn't necessary. If we used scalar types, we might avoid some
16297   // unnecessary splats, but we might miss load folding opportunities. Should
16298   // this decision be based on OptimizeForSize?
16299   bool IsFakeVector = !VT.isVector() && !IsF128;
16300   MVT LogicVT = VT;
16301   if (IsFakeVector)
16302     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16303
16304   // The mask constants are automatically splatted for vector types.
16305   unsigned EltSizeInBits = VT.getScalarSizeInBits();
16306   SDValue SignMask = DAG.getConstantFP(
16307       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16308   SDValue MagMask = DAG.getConstantFP(
16309       APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16310
16311   // First, clear all bits but the sign bit from the second operand (sign).
16312   if (IsFakeVector)
16313     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16314   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16315
16316   // Next, clear the sign bit from the first operand (magnitude).
16317   // TODO: If we had general constant folding for FP logic ops, this check
16318   // wouldn't be necessary.
16319   SDValue MagBits;
16320   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16321     APFloat APF = Op0CN->getValueAPF();
16322     APF.clearSign();
16323     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16324   } else {
16325     // If the magnitude operand wasn't a constant, we need to AND out the sign.
16326     if (IsFakeVector)
16327       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16328     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16329   }
16330
16331   // OR the magnitude value with the sign bit.
16332   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16333   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16334                                           DAG.getIntPtrConstant(0, dl));
16335 }
16336
16337 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16338   SDValue N0 = Op.getOperand(0);
16339   SDLoc dl(Op);
16340   MVT VT = Op.getSimpleValueType();
16341
16342   MVT OpVT = N0.getSimpleValueType();
16343   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16344          "Unexpected type for FGETSIGN");
16345
16346   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16347   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16348   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16349   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16350   Res = DAG.getZExtOrTrunc(Res, dl, VT);
16351   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16352   return Res;
16353 }
16354
16355 // Check whether an OR'd tree is PTEST-able.
16356 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16357                                       SelectionDAG &DAG) {
16358   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16359
16360   if (!Subtarget.hasSSE41())
16361     return SDValue();
16362
16363   if (!Op->hasOneUse())
16364     return SDValue();
16365
16366   SDNode *N = Op.getNode();
16367   SDLoc DL(N);
16368
16369   SmallVector<SDValue, 8> Opnds;
16370   DenseMap<SDValue, unsigned> VecInMap;
16371   SmallVector<SDValue, 8> VecIns;
16372   EVT VT = MVT::Other;
16373
16374   // Recognize a special case where a vector is casted into wide integer to
16375   // test all 0s.
16376   Opnds.push_back(N->getOperand(0));
16377   Opnds.push_back(N->getOperand(1));
16378
16379   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16380     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16381     // BFS traverse all OR'd operands.
16382     if (I->getOpcode() == ISD::OR) {
16383       Opnds.push_back(I->getOperand(0));
16384       Opnds.push_back(I->getOperand(1));
16385       // Re-evaluate the number of nodes to be traversed.
16386       e += 2; // 2 more nodes (LHS and RHS) are pushed.
16387       continue;
16388     }
16389
16390     // Quit if a non-EXTRACT_VECTOR_ELT
16391     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16392       return SDValue();
16393
16394     // Quit if without a constant index.
16395     SDValue Idx = I->getOperand(1);
16396     if (!isa<ConstantSDNode>(Idx))
16397       return SDValue();
16398
16399     SDValue ExtractedFromVec = I->getOperand(0);
16400     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16401     if (M == VecInMap.end()) {
16402       VT = ExtractedFromVec.getValueType();
16403       // Quit if not 128/256-bit vector.
16404       if (!VT.is128BitVector() && !VT.is256BitVector())
16405         return SDValue();
16406       // Quit if not the same type.
16407       if (VecInMap.begin() != VecInMap.end() &&
16408           VT != VecInMap.begin()->first.getValueType())
16409         return SDValue();
16410       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16411       VecIns.push_back(ExtractedFromVec);
16412     }
16413     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16414   }
16415
16416   assert((VT.is128BitVector() || VT.is256BitVector()) &&
16417          "Not extracted from 128-/256-bit vector.");
16418
16419   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16420
16421   for (DenseMap<SDValue, unsigned>::const_iterator
16422         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16423     // Quit if not all elements are used.
16424     if (I->second != FullMask)
16425       return SDValue();
16426   }
16427
16428   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16429
16430   // Cast all vectors into TestVT for PTEST.
16431   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16432     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16433
16434   // If more than one full vector is evaluated, OR them first before PTEST.
16435   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16436     // Each iteration will OR 2 nodes and append the result until there is only
16437     // 1 node left, i.e. the final OR'd value of all vectors.
16438     SDValue LHS = VecIns[Slot];
16439     SDValue RHS = VecIns[Slot + 1];
16440     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16441   }
16442
16443   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16444 }
16445
16446 /// \brief return true if \c Op has a use that doesn't just read flags.
16447 static bool hasNonFlagsUse(SDValue Op) {
16448   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16449        ++UI) {
16450     SDNode *User = *UI;
16451     unsigned UOpNo = UI.getOperandNo();
16452     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16453       // Look pass truncate.
16454       UOpNo = User->use_begin().getOperandNo();
16455       User = *User->use_begin();
16456     }
16457
16458     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16459         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16460       return true;
16461   }
16462   return false;
16463 }
16464
16465 // Emit KTEST instruction for bit vectors on AVX-512
16466 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16467                          const X86Subtarget &Subtarget) {
16468   if (Op.getOpcode() == ISD::BITCAST) {
16469     auto hasKTEST = [&](MVT VT) {
16470       unsigned SizeInBits = VT.getSizeInBits();
16471       return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16472         (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16473     };
16474     SDValue Op0 = Op.getOperand(0);
16475     MVT Op0VT = Op0.getValueType().getSimpleVT();
16476     if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16477         hasKTEST(Op0VT))
16478       return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16479   }
16480   return SDValue();
16481 }
16482
16483 /// Emit nodes that will be selected as "test Op0,Op0", or something
16484 /// equivalent.
16485 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16486                                     SelectionDAG &DAG) const {
16487   if (Op.getValueType() == MVT::i1) {
16488     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16489     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16490                        DAG.getConstant(0, dl, MVT::i8));
16491   }
16492   // CF and OF aren't always set the way we want. Determine which
16493   // of these we need.
16494   bool NeedCF = false;
16495   bool NeedOF = false;
16496   switch (X86CC) {
16497   default: break;
16498   case X86::COND_A: case X86::COND_AE:
16499   case X86::COND_B: case X86::COND_BE:
16500     NeedCF = true;
16501     break;
16502   case X86::COND_G: case X86::COND_GE:
16503   case X86::COND_L: case X86::COND_LE:
16504   case X86::COND_O: case X86::COND_NO: {
16505     // Check if we really need to set the
16506     // Overflow flag. If NoSignedWrap is present
16507     // that is not actually needed.
16508     switch (Op->getOpcode()) {
16509     case ISD::ADD:
16510     case ISD::SUB:
16511     case ISD::MUL:
16512     case ISD::SHL:
16513       if (Op.getNode()->getFlags().hasNoSignedWrap())
16514         break;
16515       LLVM_FALLTHROUGH;
16516     default:
16517       NeedOF = true;
16518       break;
16519     }
16520     break;
16521   }
16522   }
16523   // See if we can use the EFLAGS value from the operand instead of
16524   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16525   // we prove that the arithmetic won't overflow, we can't use OF or CF.
16526   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16527     // Emit KTEST for bit vectors
16528     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16529       return Node;
16530     // Emit a CMP with 0, which is the TEST pattern.
16531     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16532                        DAG.getConstant(0, dl, Op.getValueType()));
16533   }
16534   unsigned Opcode = 0;
16535   unsigned NumOperands = 0;
16536
16537   // Truncate operations may prevent the merge of the SETCC instruction
16538   // and the arithmetic instruction before it. Attempt to truncate the operands
16539   // of the arithmetic instruction and use a reduced bit-width instruction.
16540   bool NeedTruncation = false;
16541   SDValue ArithOp = Op;
16542   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16543     SDValue Arith = Op->getOperand(0);
16544     // Both the trunc and the arithmetic op need to have one user each.
16545     if (Arith->hasOneUse())
16546       switch (Arith.getOpcode()) {
16547         default: break;
16548         case ISD::ADD:
16549         case ISD::SUB:
16550         case ISD::AND:
16551         case ISD::OR:
16552         case ISD::XOR: {
16553           NeedTruncation = true;
16554           ArithOp = Arith;
16555         }
16556       }
16557   }
16558
16559   // Sometimes flags can be set either with an AND or with an SRL/SHL
16560   // instruction. SRL/SHL variant should be preferred for masks longer than this
16561   // number of bits.
16562   const int ShiftToAndMaxMaskWidth = 32;
16563   const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16564
16565   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16566   // which may be the result of a CAST.  We use the variable 'Op', which is the
16567   // non-casted variable when we check for possible users.
16568   switch (ArithOp.getOpcode()) {
16569   case ISD::ADD:
16570     // Due to an isel shortcoming, be conservative if this add is likely to be
16571     // selected as part of a load-modify-store instruction. When the root node
16572     // in a match is a store, isel doesn't know how to remap non-chain non-flag
16573     // uses of other nodes in the match, such as the ADD in this case. This
16574     // leads to the ADD being left around and reselected, with the result being
16575     // two adds in the output.  Alas, even if none our users are stores, that
16576     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
16577     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
16578     // climbing the DAG back to the root, and it doesn't seem to be worth the
16579     // effort.
16580     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16581          UE = Op.getNode()->use_end(); UI != UE; ++UI)
16582       if (UI->getOpcode() != ISD::CopyToReg &&
16583           UI->getOpcode() != ISD::SETCC &&
16584           UI->getOpcode() != ISD::STORE)
16585         goto default_case;
16586
16587     if (ConstantSDNode *C =
16588         dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16589       // An add of one will be selected as an INC.
16590       if (C->isOne() && !Subtarget.slowIncDec()) {
16591         Opcode = X86ISD::INC;
16592         NumOperands = 1;
16593         break;
16594       }
16595
16596       // An add of negative one (subtract of one) will be selected as a DEC.
16597       if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16598         Opcode = X86ISD::DEC;
16599         NumOperands = 1;
16600         break;
16601       }
16602     }
16603
16604     // Otherwise use a regular EFLAGS-setting add.
16605     Opcode = X86ISD::ADD;
16606     NumOperands = 2;
16607     break;
16608   case ISD::SHL:
16609   case ISD::SRL:
16610     // If we have a constant logical shift that's only used in a comparison
16611     // against zero turn it into an equivalent AND. This allows turning it into
16612     // a TEST instruction later.
16613     if (ZeroCheck && Op->hasOneUse() &&
16614         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16615       EVT VT = Op.getValueType();
16616       unsigned BitWidth = VT.getSizeInBits();
16617       unsigned ShAmt = Op->getConstantOperandVal(1);
16618       if (ShAmt >= BitWidth) // Avoid undefined shifts.
16619         break;
16620       APInt Mask = ArithOp.getOpcode() == ISD::SRL
16621                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16622                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16623       if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16624         break;
16625       Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16626                        DAG.getConstant(Mask, dl, VT));
16627     }
16628     break;
16629
16630   case ISD::AND:
16631     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16632     // because a TEST instruction will be better. However, AND should be
16633     // preferred if the instruction can be combined into ANDN.
16634     if (!hasNonFlagsUse(Op)) {
16635       SDValue Op0 = ArithOp->getOperand(0);
16636       SDValue Op1 = ArithOp->getOperand(1);
16637       EVT VT = ArithOp.getValueType();
16638       bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16639       bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16640       bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16641
16642       // If we cannot select an ANDN instruction, check if we can replace
16643       // AND+IMM64 with a shift before giving up. This is possible for masks
16644       // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16645       if (!isProperAndn) {
16646         if (!ZeroCheck)
16647           break;
16648
16649         assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16650         auto *CN = dyn_cast<ConstantSDNode>(Op1);
16651         if (!CN)
16652           break;
16653
16654         const APInt &Mask = CN->getAPIntValue();
16655         if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16656           break; // Prefer TEST instruction.
16657
16658         unsigned BitWidth = Mask.getBitWidth();
16659         unsigned LeadingOnes = Mask.countLeadingOnes();
16660         unsigned TrailingZeros = Mask.countTrailingZeros();
16661
16662         if (LeadingOnes + TrailingZeros == BitWidth) {
16663           assert(TrailingZeros < VT.getSizeInBits() &&
16664                  "Shift amount should be less than the type width");
16665           MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16666           SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16667           Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16668           break;
16669         }
16670
16671         unsigned LeadingZeros = Mask.countLeadingZeros();
16672         unsigned TrailingOnes = Mask.countTrailingOnes();
16673
16674         if (LeadingZeros + TrailingOnes == BitWidth) {
16675           assert(LeadingZeros < VT.getSizeInBits() &&
16676                  "Shift amount should be less than the type width");
16677           MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16678           SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16679           Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16680           break;
16681         }
16682
16683         break;
16684       }
16685     }
16686     LLVM_FALLTHROUGH;
16687   case ISD::SUB:
16688   case ISD::OR:
16689   case ISD::XOR:
16690     // Due to the ISEL shortcoming noted above, be conservative if this op is
16691     // likely to be selected as part of a load-modify-store instruction.
16692     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16693            UE = Op.getNode()->use_end(); UI != UE; ++UI)
16694       if (UI->getOpcode() == ISD::STORE)
16695         goto default_case;
16696
16697     // Otherwise use a regular EFLAGS-setting instruction.
16698     switch (ArithOp.getOpcode()) {
16699     default: llvm_unreachable("unexpected operator!");
16700     case ISD::SUB: Opcode = X86ISD::SUB; break;
16701     case ISD::XOR: Opcode = X86ISD::XOR; break;
16702     case ISD::AND: Opcode = X86ISD::AND; break;
16703     case ISD::OR: {
16704       if (!NeedTruncation && ZeroCheck) {
16705         if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16706           return EFLAGS;
16707       }
16708       Opcode = X86ISD::OR;
16709       break;
16710     }
16711     }
16712
16713     NumOperands = 2;
16714     break;
16715   case X86ISD::ADD:
16716   case X86ISD::SUB:
16717   case X86ISD::INC:
16718   case X86ISD::DEC:
16719   case X86ISD::OR:
16720   case X86ISD::XOR:
16721   case X86ISD::AND:
16722     return SDValue(Op.getNode(), 1);
16723   default:
16724   default_case:
16725     break;
16726   }
16727
16728   // If we found that truncation is beneficial, perform the truncation and
16729   // update 'Op'.
16730   if (NeedTruncation) {
16731     EVT VT = Op.getValueType();
16732     SDValue WideVal = Op->getOperand(0);
16733     EVT WideVT = WideVal.getValueType();
16734     unsigned ConvertedOp = 0;
16735     // Use a target machine opcode to prevent further DAGCombine
16736     // optimizations that may separate the arithmetic operations
16737     // from the setcc node.
16738     switch (WideVal.getOpcode()) {
16739       default: break;
16740       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16741       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16742       case ISD::AND: ConvertedOp = X86ISD::AND; break;
16743       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
16744       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16745     }
16746
16747     if (ConvertedOp) {
16748       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16749       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16750         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16751         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16752         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16753       }
16754     }
16755   }
16756
16757   if (Opcode == 0) {
16758     // Emit KTEST for bit vectors
16759     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16760       return Node;
16761
16762     // Emit a CMP with 0, which is the TEST pattern.
16763     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16764                        DAG.getConstant(0, dl, Op.getValueType()));
16765   }
16766   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16767   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16768
16769   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16770   DAG.ReplaceAllUsesWith(Op, New);
16771   return SDValue(New.getNode(), 1);
16772 }
16773
16774 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16775 /// equivalent.
16776 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16777                                    const SDLoc &dl, SelectionDAG &DAG) const {
16778   if (isNullConstant(Op1))
16779     return EmitTest(Op0, X86CC, dl, DAG);
16780
16781   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16782          "Unexpected comparison operation for MVT::i1 operands");
16783
16784   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16785        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16786     // Only promote the compare up to I32 if it is a 16 bit operation
16787     // with an immediate.  16 bit immediates are to be avoided.
16788     if ((Op0.getValueType() == MVT::i16 &&
16789          (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16790         !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16791         !Subtarget.isAtom()) {
16792       unsigned ExtendOp =
16793           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16794       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16795       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16796     }
16797     // Use SUB instead of CMP to enable CSE between SUB and CMP.
16798     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16799     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16800                               Op0, Op1);
16801     return SDValue(Sub.getNode(), 1);
16802   }
16803   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16804 }
16805
16806 /// Convert a comparison if required by the subtarget.
16807 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16808                                                  SelectionDAG &DAG) const {
16809   // If the subtarget does not support the FUCOMI instruction, floating-point
16810   // comparisons have to be converted.
16811   if (Subtarget.hasCMov() ||
16812       Cmp.getOpcode() != X86ISD::CMP ||
16813       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16814       !Cmp.getOperand(1).getValueType().isFloatingPoint())
16815     return Cmp;
16816
16817   // The instruction selector will select an FUCOM instruction instead of
16818   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16819   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16820   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16821   SDLoc dl(Cmp);
16822   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16823   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16824   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16825                             DAG.getConstant(8, dl, MVT::i8));
16826   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16827
16828   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16829   assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16830   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16831 }
16832
16833 /// Check if replacement of SQRT with RSQRT should be disabled.
16834 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16835   EVT VT = Op.getValueType();
16836
16837   // We never want to use both SQRT and RSQRT instructions for the same input.
16838   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16839     return false;
16840
16841   if (VT.isVector())
16842     return Subtarget.hasFastVectorFSQRT();
16843   return Subtarget.hasFastScalarFSQRT();
16844 }
16845
16846 /// The minimum architected relative accuracy is 2^-12. We need one
16847 /// Newton-Raphson step to have a good float result (24 bits of precision).
16848 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16849                                            SelectionDAG &DAG, int Enabled,
16850                                            int &RefinementSteps,
16851                                            bool &UseOneConstNR,
16852                                            bool Reciprocal) const {
16853   EVT VT = Op.getValueType();
16854
16855   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16856   // TODO: Add support for AVX512 (v16f32).
16857   // It is likely not profitable to do this for f64 because a double-precision
16858   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16859   // instructions: convert to single, rsqrtss, convert back to double, refine
16860   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16861   // along with FMA, this could be a throughput win.
16862   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16863       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16864       (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16865     if (RefinementSteps == ReciprocalEstimate::Unspecified)
16866       RefinementSteps = 1;
16867
16868     UseOneConstNR = false;
16869     return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16870   }
16871   return SDValue();
16872 }
16873
16874 /// The minimum architected relative accuracy is 2^-12. We need one
16875 /// Newton-Raphson step to have a good float result (24 bits of precision).
16876 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16877                                             int Enabled,
16878                                             int &RefinementSteps) const {
16879   EVT VT = Op.getValueType();
16880
16881   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16882   // TODO: Add support for AVX512 (v16f32).
16883   // It is likely not profitable to do this for f64 because a double-precision
16884   // reciprocal estimate with refinement on x86 prior to FMA requires
16885   // 15 instructions: convert to single, rcpss, convert back to double, refine
16886   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16887   // along with FMA, this could be a throughput win.
16888
16889   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16890       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16891       (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16892     // Enable estimate codegen with 1 refinement step for vector division.
16893     // Scalar division estimates are disabled because they break too much
16894     // real-world code. These defaults are intended to match GCC behavior.
16895     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16896       return SDValue();
16897
16898     if (RefinementSteps == ReciprocalEstimate::Unspecified)
16899       RefinementSteps = 1;
16900
16901     return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16902   }
16903   return SDValue();
16904 }
16905
16906 /// If we have at least two divisions that use the same divisor, convert to
16907 /// multiplication by a reciprocal. This may need to be adjusted for a given
16908 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16909 /// This is because we still need one division to calculate the reciprocal and
16910 /// then we need two multiplies by that reciprocal as replacements for the
16911 /// original divisions.
16912 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16913   return 2;
16914 }
16915
16916 /// Helper for creating a X86ISD::SETCC node.
16917 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16918                         SelectionDAG &DAG) {
16919   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16920                      DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16921 }
16922
16923 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16924 /// according to equal/not-equal condition code \p CC.
16925 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16926                                    const SDLoc &dl, SelectionDAG &DAG) {
16927   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
16928   // instruction.  Since the shift amount is in-range-or-undefined, we know
16929   // that doing a bittest on the i32 value is ok.  We extend to i32 because
16930   // the encoding for the i16 version is larger than the i32 version.
16931   // Also promote i16 to i32 for performance / code size reason.
16932   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16933     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16934
16935   // See if we can use the 32-bit instruction instead of the 64-bit one for a
16936   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16937   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16938   // known to be zero.
16939   if (Src.getValueType() == MVT::i64 &&
16940       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16941     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16942
16943   // If the operand types disagree, extend the shift amount to match.  Since
16944   // BT ignores high bits (like shifts) we can use anyextend.
16945   if (Src.getValueType() != BitNo.getValueType())
16946     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16947
16948   SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16949   X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16950   return getSETCC(Cond, BT, dl , DAG);
16951 }
16952
16953 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16954 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16955                             const SDLoc &dl, SelectionDAG &DAG) {
16956   SDValue Op0 = And.getOperand(0);
16957   SDValue Op1 = And.getOperand(1);
16958   if (Op0.getOpcode() == ISD::TRUNCATE)
16959     Op0 = Op0.getOperand(0);
16960   if (Op1.getOpcode() == ISD::TRUNCATE)
16961     Op1 = Op1.getOperand(0);
16962
16963   SDValue LHS, RHS;
16964   if (Op1.getOpcode() == ISD::SHL)
16965     std::swap(Op0, Op1);
16966   if (Op0.getOpcode() == ISD::SHL) {
16967     if (isOneConstant(Op0.getOperand(0))) {
16968       // If we looked past a truncate, check that it's only truncating away
16969       // known zeros.
16970       unsigned BitWidth = Op0.getValueSizeInBits();
16971       unsigned AndBitWidth = And.getValueSizeInBits();
16972       if (BitWidth > AndBitWidth) {
16973         KnownBits Known;
16974         DAG.computeKnownBits(Op0, Known);
16975         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
16976           return SDValue();
16977       }
16978       LHS = Op1;
16979       RHS = Op0.getOperand(1);
16980     }
16981   } else if (Op1.getOpcode() == ISD::Constant) {
16982     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16983     uint64_t AndRHSVal = AndRHS->getZExtValue();
16984     SDValue AndLHS = Op0;
16985
16986     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16987       LHS = AndLHS.getOperand(0);
16988       RHS = AndLHS.getOperand(1);
16989     }
16990
16991     // Use BT if the immediate can't be encoded in a TEST instruction.
16992     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16993       LHS = AndLHS;
16994       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16995     }
16996   }
16997
16998   if (LHS.getNode())
16999     return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17000
17001   return SDValue();
17002 }
17003
17004 // Convert (truncate (srl X, N) to i1) to (bt X, N)
17005 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
17006                                  const SDLoc &dl, SelectionDAG &DAG) {
17007
17008   assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
17009          "Expected TRUNCATE to i1 node");
17010
17011   if (Op.getOperand(0).getOpcode() != ISD::SRL)
17012     return SDValue();
17013
17014   SDValue ShiftRight = Op.getOperand(0);
17015   return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
17016                              CC, dl, DAG);
17017 }
17018
17019 /// Result of 'and' or 'trunc to i1' is compared against zero.
17020 /// Change to a BT node if possible.
17021 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
17022                                      const SDLoc &dl, SelectionDAG &DAG) const {
17023   if (Op.getOpcode() == ISD::AND)
17024     return LowerAndToBT(Op, CC, dl, DAG);
17025   if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
17026     return LowerTruncateToBT(Op, CC, dl, DAG);
17027   return SDValue();
17028 }
17029
17030 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
17031 /// CMPs.
17032 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17033                               SDValue &Op1) {
17034   unsigned SSECC;
17035   bool Swap = false;
17036
17037   // SSE Condition code mapping:
17038   //  0 - EQ
17039   //  1 - LT
17040   //  2 - LE
17041   //  3 - UNORD
17042   //  4 - NEQ
17043   //  5 - NLT
17044   //  6 - NLE
17045   //  7 - ORD
17046   switch (SetCCOpcode) {
17047   default: llvm_unreachable("Unexpected SETCC condition");
17048   case ISD::SETOEQ:
17049   case ISD::SETEQ:  SSECC = 0; break;
17050   case ISD::SETOGT:
17051   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
17052   case ISD::SETLT:
17053   case ISD::SETOLT: SSECC = 1; break;
17054   case ISD::SETOGE:
17055   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
17056   case ISD::SETLE:
17057   case ISD::SETOLE: SSECC = 2; break;
17058   case ISD::SETUO:  SSECC = 3; break;
17059   case ISD::SETUNE:
17060   case ISD::SETNE:  SSECC = 4; break;
17061   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17062   case ISD::SETUGE: SSECC = 5; break;
17063   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17064   case ISD::SETUGT: SSECC = 6; break;
17065   case ISD::SETO:   SSECC = 7; break;
17066   case ISD::SETUEQ:
17067   case ISD::SETONE: SSECC = 8; break;
17068   }
17069   if (Swap)
17070     std::swap(Op0, Op1);
17071
17072   return SSECC;
17073 }
17074
17075 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
17076 /// concatenate the result back.
17077 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17078   MVT VT = Op.getSimpleValueType();
17079
17080   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17081          "Unsupported value type for operation");
17082
17083   unsigned NumElems = VT.getVectorNumElements();
17084   SDLoc dl(Op);
17085   SDValue CC = Op.getOperand(2);
17086
17087   // Extract the LHS vectors
17088   SDValue LHS = Op.getOperand(0);
17089   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17090   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17091
17092   // Extract the RHS vectors
17093   SDValue RHS = Op.getOperand(1);
17094   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17095   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17096
17097   // Issue the operation on the smaller types and concatenate the result back
17098   MVT EltVT = VT.getVectorElementType();
17099   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17100   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17101                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17102                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17103 }
17104
17105 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17106   SDValue Op0 = Op.getOperand(0);
17107   SDValue Op1 = Op.getOperand(1);
17108   SDValue CC = Op.getOperand(2);
17109   MVT VT = Op.getSimpleValueType();
17110   SDLoc dl(Op);
17111
17112   assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17113          "Unexpected type for boolean compare operation");
17114   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17115   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17116                                DAG.getConstant(-1, dl, VT));
17117   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17118                                DAG.getConstant(-1, dl, VT));
17119   switch (SetCCOpcode) {
17120   default: llvm_unreachable("Unexpected SETCC condition");
17121   case ISD::SETEQ:
17122     // (x == y) -> ~(x ^ y)
17123     return DAG.getNode(ISD::XOR, dl, VT,
17124                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17125                        DAG.getConstant(-1, dl, VT));
17126   case ISD::SETNE:
17127     // (x != y) -> (x ^ y)
17128     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17129   case ISD::SETUGT:
17130   case ISD::SETGT:
17131     // (x > y) -> (x & ~y)
17132     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17133   case ISD::SETULT:
17134   case ISD::SETLT:
17135     // (x < y) -> (~x & y)
17136     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17137   case ISD::SETULE:
17138   case ISD::SETLE:
17139     // (x <= y) -> (~x | y)
17140     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17141   case ISD::SETUGE:
17142   case ISD::SETGE:
17143     // (x >=y) -> (x | ~y)
17144     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17145   }
17146 }
17147
17148 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17149
17150   SDValue Op0 = Op.getOperand(0);
17151   SDValue Op1 = Op.getOperand(1);
17152   SDValue CC = Op.getOperand(2);
17153   MVT VT = Op.getSimpleValueType();
17154   SDLoc dl(Op);
17155
17156   assert(VT.getVectorElementType() == MVT::i1 &&
17157          "Cannot set masked compare for this operation");
17158
17159   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17160   unsigned  Opc = 0;
17161   bool Unsigned = false;
17162   bool Swap = false;
17163   unsigned SSECC;
17164   switch (SetCCOpcode) {
17165   default: llvm_unreachable("Unexpected SETCC condition");
17166   case ISD::SETNE:  SSECC = 4; break;
17167   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
17168   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17169   case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
17170   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
17171   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17172   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17173   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
17174   case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17175   case ISD::SETLE:  SSECC = 2; break;
17176   }
17177
17178   if (Swap)
17179     std::swap(Op0, Op1);
17180   if (Opc)
17181     return DAG.getNode(Opc, dl, VT, Op0, Op1);
17182   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17183   return DAG.getNode(Opc, dl, VT, Op0, Op1,
17184                      DAG.getConstant(SSECC, dl, MVT::i8));
17185 }
17186
17187 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17188 /// operand \p Op1.  If non-trivial (for example because it's not constant)
17189 /// return an empty value.
17190 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17191                                       SelectionDAG &DAG) {
17192   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17193   if (!BV)
17194     return SDValue();
17195
17196   MVT VT = Op1.getSimpleValueType();
17197   MVT EVT = VT.getVectorElementType();
17198   unsigned n = VT.getVectorNumElements();
17199   SmallVector<SDValue, 8> ULTOp1;
17200
17201   for (unsigned i = 0; i < n; ++i) {
17202     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17203     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17204       return SDValue();
17205
17206     // Avoid underflow.
17207     APInt Val = Elt->getAPIntValue();
17208     if (Val == 0)
17209       return SDValue();
17210
17211     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17212   }
17213
17214   return DAG.getBuildVector(VT, dl, ULTOp1);
17215 }
17216
17217 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17218                            SelectionDAG &DAG) {
17219   SDValue Op0 = Op.getOperand(0);
17220   SDValue Op1 = Op.getOperand(1);
17221   SDValue CC = Op.getOperand(2);
17222   MVT VT = Op.getSimpleValueType();
17223   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17224   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17225   SDLoc dl(Op);
17226
17227   if (isFP) {
17228 #ifndef NDEBUG
17229     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17230     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17231 #endif
17232
17233     unsigned Opc;
17234     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17235       assert(VT.getVectorNumElements() <= 16);
17236       Opc = X86ISD::CMPM;
17237     } else {
17238       Opc = X86ISD::CMPP;
17239       // The SSE/AVX packed FP comparison nodes are defined with a
17240       // floating-point vector result that matches the operand type. This allows
17241       // them to work with an SSE1 target (integer vector types are not legal).
17242       VT = Op0.getSimpleValueType();
17243     }
17244
17245     // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17246     // emit two comparisons and a logic op to tie them together.
17247     // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17248     // available.
17249     SDValue Cmp;
17250     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17251     if (SSECC == 8) {
17252       // LLVM predicate is SETUEQ or SETONE.
17253       unsigned CC0, CC1;
17254       unsigned CombineOpc;
17255       if (Cond == ISD::SETUEQ) {
17256         CC0 = 3; // UNORD
17257         CC1 = 0; // EQ
17258         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17259                                            static_cast<unsigned>(ISD::OR);
17260       } else {
17261         assert(Cond == ISD::SETONE);
17262         CC0 = 7; // ORD
17263         CC1 = 4; // NEQ
17264         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17265                                            static_cast<unsigned>(ISD::AND);
17266       }
17267
17268       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17269                                  DAG.getConstant(CC0, dl, MVT::i8));
17270       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17271                                  DAG.getConstant(CC1, dl, MVT::i8));
17272       Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17273     } else {
17274       // Handle all other FP comparisons here.
17275       Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17276                         DAG.getConstant(SSECC, dl, MVT::i8));
17277     }
17278
17279     // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17280     // result type of SETCC. The bitcast is expected to be optimized away
17281     // during combining/isel.
17282     if (Opc == X86ISD::CMPP)
17283       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17284
17285     return Cmp;
17286   }
17287
17288   MVT VTOp0 = Op0.getSimpleValueType();
17289   assert(VTOp0 == Op1.getSimpleValueType() &&
17290          "Expected operands with same type!");
17291   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17292          "Invalid number of packed elements for source and destination!");
17293
17294   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17295     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17296     // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
17297     // legalizer firstly checks if the first operand in input to the setcc has
17298     // a legal type. If so, then it promotes the return type to that same type.
17299     // Otherwise, the return type is promoted to the 'next legal type' which,
17300     // for a vector of MVT::i1 is always a 128-bit integer vector type.
17301     //
17302     // We reach this code only if the following two conditions are met:
17303     // 1. Both return type and operand type have been promoted to wider types
17304     //    by the type legalizer.
17305     // 2. The original operand type has been promoted to a 256-bit vector.
17306     //
17307     // Note that condition 2. only applies for AVX targets.
17308     SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17309     return DAG.getZExtOrTrunc(NewOp, dl, VT);
17310   }
17311
17312   // The non-AVX512 code below works under the assumption that source and
17313   // destination types are the same.
17314   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17315          "Value types for source and destination must be the same!");
17316
17317   // Break 256-bit integer vector compare into smaller ones.
17318   if (VT.is256BitVector() && !Subtarget.hasInt256())
17319     return Lower256IntVSETCC(Op, DAG);
17320
17321   // Operands are boolean (vectors of i1)
17322   MVT OpVT = Op1.getSimpleValueType();
17323   if (OpVT.getVectorElementType() == MVT::i1)
17324     return LowerBoolVSETCC_AVX512(Op, DAG);
17325
17326   // The result is boolean, but operands are int/float
17327   if (VT.getVectorElementType() == MVT::i1) {
17328     // In AVX-512 architecture setcc returns mask with i1 elements,
17329     // But there is no compare instruction for i8 and i16 elements in KNL.
17330     // In this case use SSE compare
17331     bool UseAVX512Inst =
17332       (OpVT.is512BitVector() ||
17333        OpVT.getScalarSizeInBits() >= 32 ||
17334        (Subtarget.hasBWI() && Subtarget.hasVLX()));
17335
17336     if (UseAVX512Inst)
17337       return LowerIntVSETCC_AVX512(Op, DAG);
17338
17339     return DAG.getNode(ISD::TRUNCATE, dl, VT,
17340                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17341   }
17342
17343   // Lower using XOP integer comparisons.
17344   if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17345        VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17346     // Translate compare code to XOP PCOM compare mode.
17347     unsigned CmpMode = 0;
17348     switch (Cond) {
17349     default: llvm_unreachable("Unexpected SETCC condition");
17350     case ISD::SETULT:
17351     case ISD::SETLT: CmpMode = 0x00; break;
17352     case ISD::SETULE:
17353     case ISD::SETLE: CmpMode = 0x01; break;
17354     case ISD::SETUGT:
17355     case ISD::SETGT: CmpMode = 0x02; break;
17356     case ISD::SETUGE:
17357     case ISD::SETGE: CmpMode = 0x03; break;
17358     case ISD::SETEQ: CmpMode = 0x04; break;
17359     case ISD::SETNE: CmpMode = 0x05; break;
17360     }
17361
17362     // Are we comparing unsigned or signed integers?
17363     unsigned Opc =
17364         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17365
17366     return DAG.getNode(Opc, dl, VT, Op0, Op1,
17367                        DAG.getConstant(CmpMode, dl, MVT::i8));
17368   }
17369
17370   // We are handling one of the integer comparisons here. Since SSE only has
17371   // GT and EQ comparisons for integer, swapping operands and multiple
17372   // operations may be required for some comparisons.
17373   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17374                                                             : X86ISD::PCMPGT;
17375   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17376               Cond == ISD::SETGE || Cond == ISD::SETUGE;
17377   bool Invert = Cond == ISD::SETNE ||
17378                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17379
17380   // If both operands are known non-negative, then an unsigned compare is the
17381   // same as a signed compare and there's no need to flip signbits.
17382   // TODO: We could check for more general simplifications here since we're
17383   // computing known bits.
17384   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17385                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17386
17387   // Special case: Use min/max operations for SETULE/SETUGE
17388   MVT VET = VT.getVectorElementType();
17389   bool HasMinMax =
17390       (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
17391       (Subtarget.hasSSE2() && (VET == MVT::i8));
17392   bool MinMax = false;
17393   if (HasMinMax) {
17394     switch (Cond) {
17395     default: break;
17396     case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17397     case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17398     }
17399
17400     if (MinMax)
17401       Swap = Invert = FlipSigns = false;
17402   }
17403
17404   bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17405   bool Subus = false;
17406   if (!MinMax && HasSubus) {
17407     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17408     // Op0 u<= Op1:
17409     //   t = psubus Op0, Op1
17410     //   pcmpeq t, <0..0>
17411     switch (Cond) {
17412     default: break;
17413     case ISD::SETULT: {
17414       // If the comparison is against a constant we can turn this into a
17415       // setule.  With psubus, setule does not require a swap.  This is
17416       // beneficial because the constant in the register is no longer
17417       // destructed as the destination so it can be hoisted out of a loop.
17418       // Only do this pre-AVX since vpcmp* is no longer destructive.
17419       if (Subtarget.hasAVX())
17420         break;
17421       if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17422         Op1 = ULEOp1;
17423         Subus = true; Invert = false; Swap = false;
17424       }
17425       break;
17426     }
17427     // Psubus is better than flip-sign because it requires no inversion.
17428     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
17429     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17430     }
17431
17432     if (Subus) {
17433       Opc = X86ISD::SUBUS;
17434       FlipSigns = false;
17435     }
17436   }
17437
17438   if (Swap)
17439     std::swap(Op0, Op1);
17440
17441   // Check that the operation in question is available (most are plain SSE2,
17442   // but PCMPGTQ and PCMPEQQ have different requirements).
17443   if (VT == MVT::v2i64) {
17444     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17445       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17446
17447       // First cast everything to the right type.
17448       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17449       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17450
17451       // Since SSE has no unsigned integer comparisons, we need to flip the sign
17452       // bits of the inputs before performing those operations. The lower
17453       // compare is always unsigned.
17454       SDValue SB;
17455       if (FlipSigns) {
17456         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17457       } else {
17458         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17459         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17460         SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17461       }
17462       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17463       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17464
17465       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17466       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17467       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17468
17469       // Create masks for only the low parts/high parts of the 64 bit integers.
17470       static const int MaskHi[] = { 1, 1, 3, 3 };
17471       static const int MaskLo[] = { 0, 0, 2, 2 };
17472       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17473       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17474       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17475
17476       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17477       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17478
17479       if (Invert)
17480         Result = DAG.getNOT(dl, Result, MVT::v4i32);
17481
17482       return DAG.getBitcast(VT, Result);
17483     }
17484
17485     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17486       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17487       // pcmpeqd + pshufd + pand.
17488       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17489
17490       // First cast everything to the right type.
17491       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17492       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17493
17494       // Do the compare.
17495       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17496
17497       // Make sure the lower and upper halves are both all-ones.
17498       static const int Mask[] = { 1, 0, 3, 2 };
17499       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17500       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17501
17502       if (Invert)
17503         Result = DAG.getNOT(dl, Result, MVT::v4i32);
17504
17505       return DAG.getBitcast(VT, Result);
17506     }
17507   }
17508
17509   // Since SSE has no unsigned integer comparisons, we need to flip the sign
17510   // bits of the inputs before performing those operations.
17511   if (FlipSigns) {
17512     MVT EltVT = VT.getVectorElementType();
17513     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17514                                  VT);
17515     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17516     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17517   }
17518
17519   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17520
17521   // If the logical-not of the result is required, perform that now.
17522   if (Invert)
17523     Result = DAG.getNOT(dl, Result, VT);
17524
17525   if (MinMax)
17526     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17527
17528   if (Subus)
17529     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17530                          getZeroVector(VT, Subtarget, DAG, dl));
17531
17532   return Result;
17533 }
17534
17535 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17536
17537   MVT VT = Op.getSimpleValueType();
17538
17539   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17540
17541   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17542   SDValue Op0 = Op.getOperand(0);
17543   SDValue Op1 = Op.getOperand(1);
17544   SDLoc dl(Op);
17545   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17546
17547   // Optimize to BT if possible.
17548   // Lower (X & (1 << N)) == 0 to BT(X, N).
17549   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17550   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17551   // Lower (trunc (X >> N) to i1) to BT(X, N).
17552   if (Op0.hasOneUse() && isNullConstant(Op1) &&
17553       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17554     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17555       if (VT == MVT::i1)
17556         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17557       return NewSetCC;
17558     }
17559   }
17560
17561   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
17562   // these.
17563   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17564       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17565
17566     // If the input is a setcc, then reuse the input setcc or use a new one with
17567     // the inverted condition.
17568     if (Op0.getOpcode() == X86ISD::SETCC) {
17569       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17570       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17571       if (!Invert)
17572         return Op0;
17573
17574       CCode = X86::GetOppositeBranchCondition(CCode);
17575       SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17576       if (VT == MVT::i1)
17577         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17578       return SetCC;
17579     }
17580   }
17581   if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17582     if (isOneConstant(Op1)) {
17583       ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17584       return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17585     }
17586     if (!isNullConstant(Op1)) {
17587       SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17588       return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17589     }
17590   }
17591
17592   bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17593   X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17594   if (X86CC == X86::COND_INVALID)
17595     return SDValue();
17596
17597   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17598   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17599   SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17600   if (VT == MVT::i1)
17601     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17602   return SetCC;
17603 }
17604
17605 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17606   SDValue LHS = Op.getOperand(0);
17607   SDValue RHS = Op.getOperand(1);
17608   SDValue Carry = Op.getOperand(2);
17609   SDValue Cond = Op.getOperand(3);
17610   SDLoc DL(Op);
17611
17612   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17613   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17614
17615   // Recreate the carry if needed.
17616   EVT CarryVT = Carry.getValueType();
17617   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17618   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17619                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
17620
17621   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17622   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17623   SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17624   if (Op.getSimpleValueType() == MVT::i1)
17625     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17626   return SetCC;
17627 }
17628
17629 /// Return true if opcode is a X86 logical comparison.
17630 static bool isX86LogicalCmp(SDValue Op) {
17631   unsigned Opc = Op.getOpcode();
17632   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17633       Opc == X86ISD::SAHF)
17634     return true;
17635   if (Op.getResNo() == 1 &&
17636       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17637        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17638        Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17639        Opc == X86ISD::XOR || Opc == X86ISD::AND))
17640     return true;
17641
17642   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17643     return true;
17644
17645   return false;
17646 }
17647
17648 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17649   if (V.getOpcode() != ISD::TRUNCATE)
17650     return false;
17651
17652   SDValue VOp0 = V.getOperand(0);
17653   unsigned InBits = VOp0.getValueSizeInBits();
17654   unsigned Bits = V.getValueSizeInBits();
17655   return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17656 }
17657
/// Lower an ISD::SELECT node. Operand 0 is the condition; operand 1 is the
/// value produced when the condition holds and operand 2 the value produced
/// otherwise.
///
/// The strategies below are tried in order, each returning as soon as it
/// applies:
///  - scalar f32/f64 selects fed by a single-use SETCC on the same type become
///    an FP compare mask combined with a masked select (AVX-512), a vector
///    select (AVX, non-constant operands) or FAND/FANDN/FOR;
///  - remaining scalar FP selects on AVX-512 become X86ISD::SELECTS;
///  - selects between i1 vectors are performed on a scalar or v8i1 form;
///  - selects involving all-ones constants on an X86ISD::CMP against zero, and
///    the and-1 / xor-or pattern without CMOV, become plain arithmetic;
///  - everything else becomes an X86ISD::CMOV on a flag-producing node,
///    emitting an explicit test of the condition when no existing node can be
///    reused.
17658 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  // AddTest stays true until a node that can feed the final CMOV directly has
  // been found; if it is still set near the end, Cond is tested explicitly.
17659   bool AddTest = true;
17660   SDValue Cond  = Op.getOperand(0);
17661   SDValue Op1 = Op.getOperand(1);
17662   SDValue Op2 = Op.getOperand(2);
17663   SDLoc DL(Op);
17664   MVT VT = Op1.getSimpleValueType();
  // Condition-code operand for the final CMOV; assigned further down.
17665   SDValue CC;
17666
17667   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17668   // are available or VBLENDV if AVX is available.
17669   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17670   if (Cond.getOpcode() == ISD::SETCC &&
17671       ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17672        (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17673       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17674     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17675     int SSECC = translateX86FSETCC(
17676         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17677
    // 8 is the value translateX86FSETCC yields for the predicates (SETUEQ /
    // SETONE) that no single SSE compare immediate covers; those are left to
    // the generic lowering further down.
17678     if (SSECC != 8) {
17679       if (Subtarget.hasAVX512()) {
17680         SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17681                                   CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17682         return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17683                            DL, VT, Cmp, Op1, Op2);
17684       }
17685
17686       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17687                                 DAG.getConstant(SSECC, DL, MVT::i8));
17688
17689       // If we have AVX, we can use a variable vector select (VBLENDV) instead
17690       // of 3 logic instructions for size savings and potentially speed.
17691       // Unfortunately, there is no scalar form of VBLENDV.
17692
17693       // If either operand is a constant, don't try this. We can expect to
17694       // optimize away at least one of the logic instructions later in that
17695       // case, so that sequence would be faster than a variable blend.
17696
17697       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17698       // uses XMM0 as the selection register. That may need just as many
17699       // instructions as the AND/ANDN/OR sequence due to register moves, so
17700       // don't bother.
17701
17702       if (Subtarget.hasAVX() &&
17703           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17704
17705         // Convert to vectors, do a VSELECT, and convert back to scalar.
17706         // All of the conversions should be optimized away.
17707
17708         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17709         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17710         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17711         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17712
17713         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17714         VCmp = DAG.getBitcast(VCmpVT, VCmp);
17715
17716         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17717
17718         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17719                            VSel, DAG.getIntPtrConstant(0, DL));
17720       }
      // Mask-based select: (Cmp & Op1) | (~Cmp & Op2), assuming FANDN
      // complements its first operand -- TODO confirm against the node's
      // definition.
17721       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17722       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17723       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17724     }
17725   }
17726
17727   // AVX512 fallback is to lower selects of scalar floats to masked moves.
17728   if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17729     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17730     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17731   }
17732
  // Select between i1 vectors: when both operands are either constant build
  // vectors or bitcasts, perform the select on their non-vector-of-i1 form and
  // bitcast the result back.
17733   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17734     SDValue Op1Scalar;
17735     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17736       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17737     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17738       Op1Scalar = Op1.getOperand(0);
17739     SDValue Op2Scalar;
17740     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17741       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17742     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17743       Op2Scalar = Op2.getOperand(0);
17744     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17745       SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17746                                         Op1Scalar, Op2Scalar);
17747       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17748         return DAG.getBitcast(VT, newSelect);
      // Sizes differ: view the result as v8i1 and take the subvector starting
      // at element 0. NOTE(review): this presumes the scalar select is 8 bits
      // wide -- confirm against ConvertI1VectorToInteger.
17749       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17750       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17751                          DAG.getIntPtrConstant(0, DL));
17752     }
17753   }
17754
  // Widen v2i1/v4i1 operands into the low elements of an undef v8i1, select at
  // that width, then extract the original width back out.
17755   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17756     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17757     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17758                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17759     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17760                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17761     SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17762     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17763   }
17764
  // Lower a generic SETCC condition first so the pattern matches below see the
  // X86-specific form. LowerSETCC may return an empty value, in which case the
  // original condition is kept.
17765   if (Cond.getOpcode() == ISD::SETCC) {
17766     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17767       Cond = NewCond;
17768       // If the condition was updated, it's possible that the operands of the
17769       // select were also updated (for example, EmitTest has a RAUW). Refresh
17770       // the local references to the select operands in case they got stale.
17771       Op1 = Op.getOperand(1);
17772       Op2 = Op.getOperand(2);
17773     }
17774   }
17775
17776   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17777   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17778   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17779   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17780   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17781   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17782   if (Cond.getOpcode() == X86ISD::SETCC &&
17783       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17784       isNullConstant(Cond.getOperand(1).getOperand(1))) {
17785     SDValue Cmp = Cond.getOperand(1);
17786     unsigned CondCode =
17787         cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17788
17789     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17790         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      // Y is the select operand that is not the all-ones constant.
17791       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17792       SDValue CmpOp0 = Cmp.getOperand(0);
17793
17794       // Apply further optimizations for special cases
17795       // (select (x != 0), -1, 0) -> neg & sbb
17796       // (select (x == 0), 0, -1) -> neg & sbb
17797       if (isNullConstant(Y) &&
17798           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17799         SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17800         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
17801         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
17802         SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17803                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
17804                                   SDValue(Neg.getNode(), 1));
17805         return Res;
17806       }
17807
      // Compare x against 1; the COND_B read below then distinguishes x == 0
      // from x != 0.
17808       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17809                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17810       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17811
17812       SDValue Res =   // Res = 0 or -1.
17813         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17814                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17815
17816       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17817         Res = DAG.getNOT(DL, Res, Res.getValueType());
17818
17819       if (!isNullConstant(Op2))
17820         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17821       return Res;
17822     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17823                Cmp.getOperand(0).getOpcode() == ISD::AND &&
17824                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17825       SDValue CmpOp0 = Cmp.getOperand(0);
17826       SDValue Src1, Src2;
17827       // true if Op2 is XOR or OR operator and one of its operands
17828       // is equal to Op1
17829       // ( a , a op b) || ( b , a op b)
17830       auto isOrXorPattern = [&]() {
17831         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17832             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17833           Src1 =
17834               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17835           Src2 = Op1;
17836           return true;
17837         }
17838         return false;
17839       };
17840
17841       if (isOrXorPattern()) {
17842         SDValue Neg;
17843         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17844         // we need mask of all zeros or ones with same size of the other
17845         // operands.
17846         if (CmpSz > VT.getSizeInBits())
17847           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17848         else if (CmpSz < VT.getSizeInBits())
17849           Neg = DAG.getNode(ISD::AND, DL, VT,
17850               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17851               DAG.getConstant(1, DL, VT));
17852         else
17853           Neg = CmpOp0;
17854         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17855                                    Neg); // -(and (x, 0x1))
17856         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17857         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
17858       }
17859     }
17860   }
17861
17862   // Look past (and (setcc_carry (cmp ...)), 1).
17863   if (Cond.getOpcode() == ISD::AND &&
17864       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17865       isOneConstant(Cond.getOperand(1)))
17866     Cond = Cond.getOperand(0);
17867
17868   // If condition flag is set by a X86ISD::CMP, then use it as the condition
17869   // setting operand in place of the X86ISD::SETCC.
17870   unsigned CondOpcode = Cond.getOpcode();
17871   if (CondOpcode == X86ISD::SETCC ||
17872       CondOpcode == X86ISD::SETCC_CARRY) {
17873     CC = Cond.getOperand(0);
17874
17875     SDValue Cmp = Cond.getOperand(1);
17876     unsigned Opc = Cmp.getOpcode();
    // Note: this VT is the select's own result type and shadows the outer VT
    // (taken from Op1) for the rest of this scope.
17877     MVT VT = Op.getSimpleValueType();
17878
17879     bool IllegalFPCMov = false;
17880     if (VT.isFloatingPoint() && !VT.isVector() &&
17881         !isScalarFPTypeInSSEReg(VT))  // FPStack?
17882       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17883
17884     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17885         Opc == X86ISD::BT) { // FIXME
17886       Cond = Cmp;
17887       AddTest = false;
17888     }
17889   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17890              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17891              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17892               Cond.getOperand(0).getValueType() != MVT::i8)) {
    // The condition is the overflow bit of an arithmetic-with-overflow node:
    // rebuild the operation as the matching X86 node and select directly on
    // the condition code that reports the overflow.
17893     SDValue LHS = Cond.getOperand(0);
17894     SDValue RHS = Cond.getOperand(1);
17895     unsigned X86Opcode;
17896     unsigned X86Cond;
17897     SDVTList VTs;
17898     switch (CondOpcode) {
17899     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17900     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17901     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17902     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17903     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17904     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17905     default: llvm_unreachable("unexpected overflowing operator");
17906     }
    // UMUL is created with two value results followed by the i32 result, so
    // the i32 result is number 2 there and number 1 for the other opcodes.
17907     if (CondOpcode == ISD::UMULO)
17908       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17909                           MVT::i32);
17910     else
17911       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17912
17913     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17914
17915     if (CondOpcode == ISD::UMULO)
17916       Cond = X86Op.getValue(2);
17917     else
17918       Cond = X86Op.getValue(1);
17919
17920     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17921     AddTest = false;
17922   }
17923
17924   if (AddTest) {
17925     // Look past the truncate if the high bits are known zero.
17926     if (isTruncWithZeroHighBitsInput(Cond, DAG))
17927       Cond = Cond.getOperand(0);
17928
17929     // We know the result of AND is compared against zero. Try to match
17930     // it to BT.
17931     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17932       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17933         CC = NewSetCC.getOperand(0);
17934         Cond = NewSetCC.getOperand(1);
17935         AddTest = false;
17936       }
17937     }
17938   }
17939
  // Nothing reusable was found: emit an explicit test of Cond and select on
  // COND_NE.
17940   if (AddTest) {
17941     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17942     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17943   }
17944
17945   // a <  b ? -1 :  0 -> RES = ~setcc_carry
17946   // a <  b ?  0 : -1 -> RES = setcc_carry
17947   // a >= b ? -1 :  0 -> RES = setcc_carry
17948   // a >= b ?  0 : -1 -> RES = ~setcc_carry
17949   if (Cond.getOpcode() == X86ISD::SUB) {
17950     Cond = ConvertCmpIfNecessary(Cond, DAG);
17951     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17952
17953     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17954         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17955         (isNullConstant(Op1) || isNullConstant(Op2))) {
17956       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17957                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17958                                 Cond);
17959       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17960         return DAG.getNOT(DL, Res, Res.getValueType());
17961       return Res;
17962     }
17963   }
17964
17965   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17966   // widen the cmov and push the truncate through. This avoids introducing a new
17967   // branch during isel and doesn't add any extensions.
17968   if (Op.getValueType() == MVT::i8 &&
17969       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17970     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17971     if (T1.getValueType() == T2.getValueType() &&
17972         // Blacklist CopyFromReg to avoid partial register stalls.
17973         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17974       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17975       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17976       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17977     }
17978   }
17979
17980   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17981   // condition is true.
17982   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17983   SDValue Ops[] = { Op2, Op1, CC, Cond };
17984   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17985 }
17986
17987 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17988                                        const X86Subtarget &Subtarget,
17989                                        SelectionDAG &DAG) {
        // Builds the replacement for a vector sign extension of operand 0 of Op to
        // Op's result type. Handles i1-element (mask) sources and 512-bit results
        // from wider-than-i1 sources; every other case returns an empty SDValue.
17990   MVT VT = Op->getSimpleValueType(0);
17991   SDValue In = Op->getOperand(0);
17992   MVT InVT = In.getSimpleValueType();
17993   MVT VTElt = VT.getVectorElementType();
17994   MVT InVTElt = InVT.getVectorElementType();
17995   SDLoc dl(Op);
17996
17997   // SKX processor: an i1 source is extended with a single VSEXT node when
        // BWI is available and the result elements are at most 16 bits wide, or
        // DQI is available and the result elements are at least 32 bits wide.
17998   if ((InVTElt == MVT::i1) &&
17999       (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
18000
18001        ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
18002
18003     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18004
18005   unsigned NumElts = VT.getVectorNumElements();
18006
        // 512-bit result from a non-mask source (8 or 16 elements, or any element
        // count when BWI is present). If the input is itself a VSEXT/VZEXT node,
        // apply that same opcode to its operand instead of extending twice.
18007   if (VT.is512BitVector() && InVTElt != MVT::i1 &&
18008       (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
18009     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
18010       return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
18011     return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
18012   }
18013
        // Everything below deals with mask (i1 element) sources only.
18014   if (InVTElt != MVT::i1)
18015     return SDValue();
18016
        // Without VLX, a result narrower than 512 bits is first produced in a
        // 512-bit type with the same element count (512/NumElts bits per element)
        // and truncated back to VT at the end.
18017   MVT ExtVT = VT;
18018   if (!VT.is512BitVector() && !Subtarget.hasVLX())
18019     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
18020
18021   SDValue V;
18022   if (Subtarget.hasDQI()) {
          // DQI: extend the mask via getExtendInVec. A 512-bit VT is not expected
          // on this path (asserted below).
18023     V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
18024     assert(!VT.is512BitVector() && "Unexpected vector type");
18025   } else {
          // No DQI: select all-ones or zero per lane, using the mask as condition.
18026     SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
18027     SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
18028     V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
          // Nothing to truncate when no widening took place.
18029     if (ExtVT == VT)
18030       return V;
18031   }
18032
        // Narrow the (possibly widened) value to the requested result type.
18033   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
18034 }
18035
18036 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18037 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18038 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18039 // MVT::v64i8 when BWI is not supported, but AVX512 is.
      // Returns an empty SDValue for element types or vector width / subtarget
      // combinations that are not handled here.
18040 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18041                                         const X86Subtarget &Subtarget,
18042                                         SelectionDAG &DAG) {
18043   SDValue In = Op->getOperand(0);
18044   MVT VT = Op->getSimpleValueType(0);
18045   MVT InVT = In.getSimpleValueType();
        // In-register extension: source and result occupy the same number of bits.
18046   assert(VT.getSizeInBits() == InVT.getSizeInBits());
18047
18048   MVT SVT = VT.getVectorElementType();
18049   MVT InSVT = InVT.getVectorElementType();
18050   assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18051
        // Only i8/i16/i32 sources extended to i16/i32/i64 are supported, and the
        // result width must be paired with the matching subtarget feature
        // (128-bit with SSE2, 256-bit with Int256, 512-bit with AVX512).
18052   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18053     return SDValue();
18054   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18055     return SDValue();
18056   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18057       !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18058       !(VT.is512BitVector() && Subtarget.hasAVX512()))
18059     return SDValue();
18060
18061   SDLoc dl(Op);
18062
18063   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18064   // For 512-bit vectors, we need 128-bits or 256-bits.
18065   if (VT.getSizeInBits() > 128) {
18066     // Input needs to be at least the same number of elements as output, and
18067     // at least 128-bits.
18068     int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18069     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18070   }
18071
        // Note: InVT still names the original (un-narrowed) input type here.
18072   assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18073           InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18074
18075   // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18076   // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18077   // need to be handled here for 256/512-bit results.
18078   if (Subtarget.hasInt256()) {
18079     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18080     unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18081                         X86ISD::VSEXT : X86ISD::VZEXT;
18082     return DAG.getNode(ExtOpc, dl, VT, In);
18083   }
18084
18085   // We should only get here for sign extend.
18086   assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18087          "Unexpected opcode!");
18088
18089   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18090   SDValue Curr = In;
18091   MVT CurrVT = InVT;
18092
18093   // As SRAI is only available on i16/i32 types, we expand only up to i32
18094   // and handle i64 separately.
        // Each iteration unpacks undef with Curr, then reinterprets the result as
        // a vector with half as many elements of twice the width. Presumably the
        // source element ends up in the upper part of each wider element (the
        // right shift below relies on that) - confirm against UNPCKL semantics.
18095   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18096     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18097     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18098     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18099     Curr = DAG.getBitcast(CurrVT, Curr);
18100   }
18101
        // If any widening happened, shift each element right arithmetically by
        // the number of bits that were added on top of the source element width.
18102   SDValue SignExt = Curr;
18103   if (CurrVT != InVT) {
18104     unsigned SignExtShift =
18105         CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18106     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18107                           DAG.getConstant(SignExtShift, dl, MVT::i8));
18108   }
18109
18110   if (CurrVT == VT)
18111     return SignExt;
18112
        // i64 results: shift by 31 to get a vector of sign words, then interleave
        // value/sign pairs ({0, 4, 1, 5}) and reinterpret the result as v2i64.
18113   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18114     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18115                                DAG.getConstant(31, dl, MVT::i8));
18116     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18117     return DAG.getBitcast(VT, Ext);
18118   }
18119
        // Any remaining combination is not handled by this routine.
18120   return SDValue();
18121 }
18122
18123 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18124                                 SelectionDAG &DAG) {
        // Builds the replacement for a vector SIGN_EXTEND. 512-bit results and
        // i1-element sources are forwarded to LowerSIGN_EXTEND_AVX512; otherwise
        // only v4i32->v4i64, v8i16->v8i32 and v16i8->v16i16 are handled, and an
        // empty SDValue is returned for any other type pair.
18125   MVT VT = Op->getSimpleValueType(0);
18126   SDValue In = Op->getOperand(0);
18127   MVT InVT = In.getSimpleValueType();
18128   SDLoc dl(Op);
18129
18130   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
18131     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
18132
18133   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18134       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18135       (VT != MVT::v16i16 || InVT != MVT::v16i8))
18136     return SDValue();
18137
        // With Int256 the whole extension is a single VSEXT node.
18138   if (Subtarget.hasInt256())
18139     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18140
18141   // Optimize vectors in AVX mode
18142   // Sign extend  v8i16 to v8i32 and
18143   //              v4i32 to v4i64
        // (the type check above also admits v16i8 to v16i16)
18144   //
18145   // Divide input vector into two parts
18146   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
18147   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
18148   // concat the vectors to original VT
18149
18150   unsigned NumElems = InVT.getVectorNumElements();
18151   SDValue Undef = DAG.getUNDEF(InVT);
18152
        // Mask selecting the low half of the input; the upper lanes stay undef.
18153   SmallVector<int,8> ShufMask1(NumElems, -1);
18154   for (unsigned i = 0; i != NumElems/2; ++i)
18155     ShufMask1[i] = i;
18156
18157   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18158
        // Mask moving the high half of the input into the low lanes.
18159   SmallVector<int,8> ShufMask2(NumElems, -1);
18160   for (unsigned i = 0; i != NumElems/2; ++i)
18161     ShufMask2[i] = i + NumElems/2;
18162
18163   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18164
        // Result element type with half the result element count.
18165   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18166                                 VT.getVectorNumElements() / 2);
18167
        // Sign extend the low lanes of each shuffled half in-register.
18168   OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18169   OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18170
18171   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18172 }
18173
18174 // Lower truncating store. We need a special lowering to vXi1 vectors
18175 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18176                                     SelectionDAG &DAG) {
18177   StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18178   SDLoc dl(St);
18179   EVT MemVT = St->getMemoryVT();
18180   assert(St->isTruncatingStore() && "We only custom truncating store.");
18181   assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18182          "Expected truncstore of i1 vector");
18183
18184   SDValue Op = St->getValue();
18185   MVT OpVT = Op.getValueType().getSimpleVT();
18186   unsigned NumElts = OpVT.getVectorNumElements();
18187   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18188       NumElts == 16) {
18189     // Truncate and store - everything is legal
18190     Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18191     if (MemVT.getSizeInBits() < 8)
18192       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18193                        DAG.getUNDEF(MVT::v8i1), Op,
18194                        DAG.getIntPtrConstant(0, dl));
18195     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18196                         St->getMemOperand());
18197   }
18198
18199   // A subset, assume that we have only AVX-512F
18200   if (NumElts <= 8) {
18201     if (NumElts < 8) {
18202       // Extend to 8-elts vector
18203       MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18204       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18205                         DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18206     }
18207     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18208     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18209                         St->getMemOperand());
18210   }
18211   // v32i8
18212   assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18213   // Divide the vector into 2 parts and store each part separately
18214   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18215                             DAG.getIntPtrConstant(0, dl));
18216   Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18217   SDValue BasePtr = St->getBasePtr();
18218   SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18219                               St->getMemOperand());
18220   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18221                             DAG.getIntPtrConstant(16, dl));
18222   Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18223
18224   SDValue BasePtrHi =
18225     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18226                 DAG.getConstant(2, dl, BasePtr.getValueType()));
18227
18228   SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18229                               BasePtrHi, St->getMemOperand());
18230   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18231 }
18232
18233 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18234                                            const X86Subtarget &Subtarget,
18235                                            SelectionDAG &DAG) {
18236
18237   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18238   SDLoc dl(Ld);
18239   EVT MemVT = Ld->getMemoryVT();
18240   assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18241          "Expected i1 vector load");
18242   unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18243     ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18244   MVT VT = Op.getValueType().getSimpleVT();
18245   unsigned NumElts = VT.getVectorNumElements();
18246
18247   if ((Subtarget.hasBWI() && NumElts >= 32) ||
18248       (Subtarget.hasDQI() && NumElts < 16) ||
18249       NumElts == 16) {
18250     // Load and extend - everything is legal
18251     if (NumElts < 8) {
18252       SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18253                                  Ld->getBasePtr(),
18254                                  Ld->getMemOperand());
18255       // Replace chain users with the new chain.
18256       assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18257       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18258       MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18259       SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18260
18261       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18262                                    DAG.getIntPtrConstant(0, dl));
18263     }
18264     SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18265                                Ld->getBasePtr(),
18266                                Ld->getMemOperand());
18267     // Replace chain users with the new chain.
18268     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18269     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18270
18271     // Finally, do a normal sign-extend to the desired register.
18272     return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18273   }
18274
18275   if (NumElts <= 8) {
18276     // A subset, assume that we have only AVX-512F
18277     unsigned NumBitsToLoad = 8;
18278     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18279     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18280                               Ld->getBasePtr(),
18281                               Ld->getMemOperand());
18282     // Replace chain users with the new chain.
18283     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18284     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18285
18286     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18287     SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18288
18289     if (NumElts == 8)
18290       return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18291
18292       // we should take care to v4i1 and v2i1
18293
18294     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18295     SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18296     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18297                         DAG.getIntPtrConstant(0, dl));
18298   }
18299
18300   assert(VT == MVT::v32i8 && "Unexpected extload type");
18301
18302   SmallVector<SDValue, 2> Chains;
18303
18304   SDValue BasePtr = Ld->getBasePtr();
18305   SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18306                                Ld->getBasePtr(),
18307                                Ld->getMemOperand());
18308   Chains.push_back(LoadLo.getValue(1));
18309
18310   SDValue BasePtrHi =
18311     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18312                 DAG.getConstant(2, dl, BasePtr.getValueType()));
18313
18314   SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18315                                BasePtrHi,
18316                                Ld->getMemOperand());
18317   Chains.push_back(LoadHi.getValue(1));
18318   SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18319   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18320
18321   SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18322   SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18323   return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18324 }
18325
18326 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18327 // may emit an illegal shuffle but the expansion is still better than scalar
18328 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18329 // we'll emit a shuffle and an arithmetic shift.
18330 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18331 // TODO: It is possible to support ZExt by zeroing the undef values during
18332 // the shuffle phase or after the shuffle.
      // Loads with an i1-element memory type are forwarded to
      // LowerExtended1BitVectorLoad. On every path the users of the original
      // load's chain are redirected to the chain of the replacement load(s).
18333 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18334                                  SelectionDAG &DAG) {
18335   MVT RegVT = Op.getSimpleValueType();
18336   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18337   assert(RegVT.isInteger() &&
18338          "We only custom lower integer vector sext loads.");
18339
18340   // Nothing useful we can do without SSE2 shuffles.
18341   assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18342
18343   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18344   SDLoc dl(Ld);
18345   EVT MemVT = Ld->getMemoryVT();
        // Mask (vXi1) memory types have their own lowering.
18346   if (MemVT.getScalarType() == MVT::i1)
18347     return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18348
18349   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18350   unsigned RegSz = RegVT.getSizeInBits();
18351
18352   ISD::LoadExtType Ext = Ld->getExtensionType();
18353
18354   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18355          && "Only anyext and sext are currently implemented.");
18356   assert(MemVT != RegVT && "Cannot extend to the same type");
18357   assert(MemVT.isVector() && "Must load a vector from memory");
18358
18359   unsigned NumElems = RegVT.getVectorNumElements();
18360   unsigned MemSz = MemVT.getSizeInBits();
18361   assert(RegSz > MemSz && "Register size must be greater than the mem size");
18362
18363   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18364     // The only way in which we have a legal 256-bit vector result but not the
18365     // integer 256-bit operations needed to directly lower a sextload is if we
18366     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18367     // a 128-bit vector and a normal sign_extend to 256-bits that should get
18368     // correctly legalized. We do this late to allow the canonical form of
18369     // sextload to persist throughout the rest of the DAG combiner -- it wants
18370     // to fold together any extensions it can, and so will fuse a sign_extend
18371     // of an sextload into a sextload targeting a wider value.
18372     SDValue Load;
18373     if (MemSz == 128) {
18374       // Just switch this to a normal load.
18375       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18376                                        "it must be a legal 128-bit vector "
18377                                        "type!");
18378       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18379                          Ld->getPointerInfo(), Ld->getAlignment(),
18380                          Ld->getMemOperand()->getFlags());
18381     } else {
18382       assert(MemSz < 128 &&
18383              "Can't extend a type wider than 128 bits to a 256 bit vector!");
18384       // Do an sext load to a 128-bit vector type. We want to use the same
18385       // number of elements, but elements half as wide. This will end up being
18386       // recursively lowered by this routine, but will succeed as we definitely
18387       // have all the necessary features if we're using AVX1.
18388       EVT HalfEltVT =
18389           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18390       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18391       Load =
18392           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18393                          Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18394                          Ld->getMemOperand()->getFlags());
18395     }
18396
18397     // Replace chain users with the new chain.
18398     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18399     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18400
18401     // Finally, do a normal sign-extend to the desired register.
18402     return DAG.getSExtOrTrunc(Load, dl, RegVT);
18403   }
18404
18405   // All sizes must be a power of two.
18406   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18407          "Non-power-of-two elements are not custom lowered!");
18408
18409   // Attempt to load the original value using scalar loads.
18410   // Find the largest scalar type that divides the total loaded size.
        // The last legal type in iteration order that divides MemSz wins;
        // presumably integer_valuetypes() ascends in size - confirm.
18411   MVT SclrLoadTy = MVT::i8;
18412   for (MVT Tp : MVT::integer_valuetypes()) {
18413     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18414       SclrLoadTy = Tp;
18415     }
18416   }
18417
18418   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
        // That is: when no 64-bit integer type was picked above but f64 is legal
        // and at least 64 bits are loaded, use f64 as the scalar load type.
18419   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18420       (64 <= MemSz))
18421     SclrLoadTy = MVT::f64;
18422
18423   // Calculate the number of scalar loads that we need to perform
18424   // in order to load our vector from memory.
18425   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18426
18427   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18428          "Can only lower sext loads with a single scalar load!");
18429
        // For a sign-extending load into a 256-bit or wider register, the loaded
        // data is assembled in a 128-bit vector; the extension node created below
        // produces the full RegVT from it.
18430   unsigned loadRegZize = RegSz;
18431   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18432     loadRegZize = 128;
18433
18434   // Represent our vector as a sequence of elements which are the
18435   // largest scalar that we can load.
18436   EVT LoadUnitVecVT = EVT::getVectorVT(
18437       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18438
18439   // Represent the data using the same element type that is stored in
18440   // memory. In practice, we ''widen'' MemVT.
18441   EVT WideVecVT =
18442       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18443                        loadRegZize / MemVT.getScalarSizeInBits());
18444
18445   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18446          "Invalid vector type");
18447
18448   // We can't shuffle using an illegal type.
18449   assert(TLI.isTypeLegal(WideVecVT) &&
18450          "We only lower types that form legal widened vector types");
18451
18452   SmallVector<SDValue, 8> Chains;
18453   SDValue Ptr = Ld->getBasePtr();
        // Byte distance between consecutive scalar loads.
18454   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18455                                       TLI.getPointerTy(DAG.getDataLayout()));
18456   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18457
18458   for (unsigned i = 0; i < NumLoads; ++i) {
18459     // Perform a single load.
          // NOTE(review): every iteration passes the original load's pointer info
          // and alignment unchanged although Ptr advances by Increment each time;
          // confirm whether loads after the first need an offset pointer info and
          // a reduced alignment.
18460     SDValue ScalarLoad =
18461         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18462                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18463     Chains.push_back(ScalarLoad.getValue(1));
18464     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18465     // another round of DAGCombining.
18466     if (i == 0)
18467       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18468     else
18469       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18470                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
18471
18472     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18473   }
18474
        // Merge the chains of all scalar loads; this replaces the original
        // load's chain result before each return below.
18475   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18476
18477   // Bitcast the loaded value to a vector of the original element type, in
18478   // the size of the target vector type.
18479   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18480   unsigned SizeRatio = RegSz / MemSz;
18481
18482   if (Ext == ISD::SEXTLOAD) {
18483     // If we have SSE4.1, we can directly emit a VSEXT node.
18484     if (Subtarget.hasSSE41()) {
18485       SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18486       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18487       return Sext;
18488     }
18489
18490     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18491     // lanes.
18492     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18493            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18494
18495     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18496     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18497     return Shuff;
18498   }
18499
18500   // Redistribute the loaded elements into the different locations.
        // Any-extend: loaded element i moves to lane i * SizeRatio of the wide
        // vector; all other lanes are left undefined (-1 in the mask).
18501   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18502   for (unsigned i = 0; i != NumElems; ++i)
18503     ShuffleVec[i * SizeRatio] = i;
18504
18505   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18506                                        DAG.getUNDEF(WideVecVT), ShuffleVec);
18507
18508   // Bitcast to the requested type.
18509   Shuff = DAG.getBitcast(RegVT, Shuff);
18510   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18511   return Shuff;
18512 }
18513
18514 /// Match an ISD::AND or ISD::OR whose two operands are both X86ISD::SETCC
18515 /// nodes with a single use each. \p Opc always receives Op's opcode.
18516 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18517   Opc = Op.getOpcode();
18518   if (Opc == ISD::OR || Opc == ISD::AND) {
18519     SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
18520     return LHS.getOpcode() == X86ISD::SETCC && LHS.hasOneUse() &&
18521            RHS.getOpcode() == X86ISD::SETCC && RHS.hasOneUse();
18522   }
18523   return false;
18524 }
18525
18526 /// Match (xor (X86ISD::SETCC ...), 1) where the SETCC operand has exactly
18527 /// one use.
18528 static bool isXor1OfSetCC(SDValue Op) {
18529   // Anything other than an XOR with the constant 1 on the right is rejected.
18530   if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
18531     return false;
18532
18533   SDValue SetCC = Op.getOperand(0);
18534   return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
18535 }
18536
18537 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18538   bool addTest = true;
18539   SDValue Chain = Op.getOperand(0);
18540   SDValue Cond  = Op.getOperand(1);
18541   SDValue Dest  = Op.getOperand(2);
18542   SDLoc dl(Op);
18543   SDValue CC;
18544   bool Inverted = false;
18545
18546   if (Cond.getOpcode() == ISD::SETCC) {
18547     // Check for setcc([su]{add,sub,mul}o == 0).
18548     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18549         isNullConstant(Cond.getOperand(1)) &&
18550         Cond.getOperand(0).getResNo() == 1 &&
18551         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18552          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18553          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18554          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18555          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18556          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18557       Inverted = true;
18558       Cond = Cond.getOperand(0);
18559     } else {
18560       if (SDValue NewCond = LowerSETCC(Cond, DAG))
18561         Cond = NewCond;
18562     }
18563   }
18564 #if 0
18565   // FIXME: LowerXALUO doesn't handle these!!
18566   else if (Cond.getOpcode() == X86ISD::ADD  ||
18567            Cond.getOpcode() == X86ISD::SUB  ||
18568            Cond.getOpcode() == X86ISD::SMUL ||
18569            Cond.getOpcode() == X86ISD::UMUL)
18570     Cond = LowerXALUO(Cond, DAG);
18571 #endif
18572
18573   // Look pass (and (setcc_carry (cmp ...)), 1).
18574   if (Cond.getOpcode() == ISD::AND &&
18575       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18576       isOneConstant(Cond.getOperand(1)))
18577     Cond = Cond.getOperand(0);
18578
18579   // If condition flag is set by a X86ISD::CMP, then use it as the condition
18580   // setting operand in place of the X86ISD::SETCC.
18581   unsigned CondOpcode = Cond.getOpcode();
18582   if (CondOpcode == X86ISD::SETCC ||
18583       CondOpcode == X86ISD::SETCC_CARRY) {
18584     CC = Cond.getOperand(0);
18585
18586     SDValue Cmp = Cond.getOperand(1);
18587     unsigned Opc = Cmp.getOpcode();
18588     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18589     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18590       Cond = Cmp;
18591       addTest = false;
18592     } else {
18593       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18594       default: break;
18595       case X86::COND_O:
18596       case X86::COND_B:
18597         // These can only come from an arithmetic instruction with overflow,
18598         // e.g. SADDO, UADDO.
18599         Cond = Cond.getOperand(1);
18600         addTest = false;
18601         break;
18602       }
18603     }
18604   }
18605   CondOpcode = Cond.getOpcode();
18606   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18607       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18608       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18609        Cond.getOperand(0).getValueType() != MVT::i8)) {
18610     SDValue LHS = Cond.getOperand(0);
18611     SDValue RHS = Cond.getOperand(1);
18612     unsigned X86Opcode;
18613     unsigned X86Cond;
18614     SDVTList VTs;
18615     // Keep this in sync with LowerXALUO, otherwise we might create redundant
18616     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18617     // X86ISD::INC).
18618     switch (CondOpcode) {
18619     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18620     case ISD::SADDO:
18621       if (isOneConstant(RHS)) {
18622           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18623           break;
18624         }
18625       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18626     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18627     case ISD::SSUBO:
18628       if (isOneConstant(RHS)) {
18629           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18630           break;
18631         }
18632       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18633     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18634     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18635     default: llvm_unreachable("unexpected overflowing operator");
18636     }
18637     if (Inverted)
18638       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18639     if (CondOpcode == ISD::UMULO)
18640       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18641                           MVT::i32);
18642     else
18643       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18644
18645     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18646
18647     if (CondOpcode == ISD::UMULO)
18648       Cond = X86Op.getValue(2);
18649     else
18650       Cond = X86Op.getValue(1);
18651
18652     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18653     addTest = false;
18654   } else {
18655     unsigned CondOpc;
18656     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18657       SDValue Cmp = Cond.getOperand(0).getOperand(1);
18658       if (CondOpc == ISD::OR) {
18659         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18660         // two branches instead of an explicit OR instruction with a
18661         // separate test.
18662         if (Cmp == Cond.getOperand(1).getOperand(1) &&
18663             isX86LogicalCmp(Cmp)) {
18664           CC = Cond.getOperand(0).getOperand(0);
18665           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18666                               Chain, Dest, CC, Cmp);
18667           CC = Cond.getOperand(1).getOperand(0);
18668           Cond = Cmp;
18669           addTest = false;
18670         }
18671       } else { // ISD::AND
18672         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18673         // two branches instead of an explicit AND instruction with a
18674         // separate test. However, we only do this if this block doesn't
18675         // have a fall-through edge, because this requires an explicit
18676         // jmp when the condition is false.
18677         if (Cmp == Cond.getOperand(1).getOperand(1) &&
18678             isX86LogicalCmp(Cmp) &&
18679             Op.getNode()->hasOneUse()) {
18680           X86::CondCode CCode =
18681             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18682           CCode = X86::GetOppositeBranchCondition(CCode);
18683           CC = DAG.getConstant(CCode, dl, MVT::i8);
18684           SDNode *User = *Op.getNode()->use_begin();
18685           // Look for an unconditional branch following this conditional branch.
18686           // We need this because we need to reverse the successors in order
18687           // to implement FCMP_OEQ.
18688           if (User->getOpcode() == ISD::BR) {
18689             SDValue FalseBB = User->getOperand(1);
18690             SDNode *NewBR =
18691               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18692             assert(NewBR == User);
18693             (void)NewBR;
18694             Dest = FalseBB;
18695
18696             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18697                                 Chain, Dest, CC, Cmp);
18698             X86::CondCode CCode =
18699               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18700             CCode = X86::GetOppositeBranchCondition(CCode);
18701             CC = DAG.getConstant(CCode, dl, MVT::i8);
18702             Cond = Cmp;
18703             addTest = false;
18704           }
18705         }
18706       }
18707     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18708       // Recognize "xorb (setcc), 1" patterns. The xor inverts the condition.
18709       // It should be transformed by the DAG combiner, except when the condition
18710       // is set by an arithmetic-with-overflow node.
18711       X86::CondCode CCode =
18712         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18713       CCode = X86::GetOppositeBranchCondition(CCode);
18714       CC = DAG.getConstant(CCode, dl, MVT::i8);
18715       Cond = Cond.getOperand(0).getOperand(1);
18716       addTest = false;
18717     } else if (Cond.getOpcode() == ISD::SETCC &&
18718                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18719       // For FCMP_OEQ, we can emit
18720       // two branches instead of an explicit AND instruction with a
18721       // separate test. However, we only do this if this block doesn't
18722       // have a fall-through edge, because this requires an explicit
18723       // jmp when the condition is false.
18724       if (Op.getNode()->hasOneUse()) {
18725         SDNode *User = *Op.getNode()->use_begin();
18726         // Look for an unconditional branch following this conditional branch.
18727         // We need this because we need to reverse the successors in order
18728         // to implement FCMP_OEQ.
18729         if (User->getOpcode() == ISD::BR) {
18730           SDValue FalseBB = User->getOperand(1);
18731           SDNode *NewBR =
18732             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18733           assert(NewBR == User);
18734           (void)NewBR;
18735           Dest = FalseBB;
18736
18737           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18738                                     Cond.getOperand(0), Cond.getOperand(1));
18739           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18740           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18741           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18742                               Chain, Dest, CC, Cmp);
18743           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18744           Cond = Cmp;
18745           addTest = false;
18746         }
18747       }
18748     } else if (Cond.getOpcode() == ISD::SETCC &&
18749                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18750       // For FCMP_UNE, we can emit
18751       // two branches instead of an explicit OR instruction with a
18752       // separate test. However, we only do this if this block doesn't
18753       // have a fall-through edge, because this requires an explicit
18754       // jmp when the condition is false.
18755       if (Op.getNode()->hasOneUse()) {
18756         SDNode *User = *Op.getNode()->use_begin();
18757         // Look for an unconditional branch following this conditional branch.
18758         // We need this because we need to reverse the successors in order
18759         // to implement FCMP_UNE.
18760         if (User->getOpcode() == ISD::BR) {
18761           SDValue FalseBB = User->getOperand(1);
18762           SDNode *NewBR =
18763             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18764           assert(NewBR == User);
18765           (void)NewBR;
18766
18767           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18768                                     Cond.getOperand(0), Cond.getOperand(1));
18769           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18770           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18771           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18772                               Chain, Dest, CC, Cmp);
18773           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18774           Cond = Cmp;
18775           addTest = false;
18776           Dest = FalseBB;
18777         }
18778       }
18779     }
18780   }
18781
18782   if (addTest) {
18783     // Look past the truncate if the high bits are known zero.
18784     if (isTruncWithZeroHighBitsInput(Cond, DAG))
18785         Cond = Cond.getOperand(0);
18786
18787     // We know the result is compared against zero. Try to match it to BT.
18788     if (Cond.hasOneUse()) {
18789       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18790         CC = NewSetCC.getOperand(0);
18791         Cond = NewSetCC.getOperand(1);
18792         addTest = false;
18793       }
18794     }
18795   }
18796
18797   if (addTest) {
18798     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18799     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18800     Cond = EmitTest(Cond, X86Cond, dl, DAG);
18801   }
18802   Cond = ConvertCmpIfNecessary(Cond, DAG);
18803   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18804                      Chain, Dest, CC, Cond);
18805 }
18806
18807 // Lower a dynamic stack allocation (operands: chain, size, alignment).
18808 // One of three strategies is chosen:
18809 //  * Plain: subtract the size from the stack pointer inline, re-aligning the
18810 //    result when the requested alignment exceeds the frame's stack alignment.
18811 //  * Split stacks: emit X86ISD::SEG_ALLOCA on a copy of the size.
      //  * Otherwise (Windows non-MachO targets, or functions that have a stack
      //    probe symbol): emit X86ISD::WIN_ALLOCA and read back the stack pointer.
      //    Per the original note, the _alloca call used on Cygwin/Mingw targets is
      //    needed to probe the stack when allocating more than 4k bytes in one go;
      //    touching the stack at 4K increments is necessary to ensure that the
      //    guard pages used by the OS virtual memory manager are allocated in the
      //    correct sequence.
      // The whole sequence is bracketed by CALLSEQ_START/CALLSEQ_END. The returned
      // value merges the allocated address with the output chain.
18812 SDValue
18813 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18814                                            SelectionDAG &DAG) const {
18815   MachineFunction &MF = DAG.getMachineFunction();
18816   bool SplitStack = MF.shouldSplitStack();
18817   bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
        // "Lower" selects the SEG_ALLOCA/WIN_ALLOCA paths; when false the stack
        // pointer is adjusted inline.
18818   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18819                SplitStack || EmitStackProbe;
18820   SDLoc dl(Op);
18821
18822   // Get the inputs.
18823   SDNode *Node = Op.getNode();
18824   SDValue Chain = Op.getOperand(0);
18825   SDValue Size  = Op.getOperand(1);
18826   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18827   EVT VT = Node->getValueType(0);
18828
18829   // Chain the dynamic stack allocation so that it doesn't modify the stack
18830   // pointer when other instructions are using the stack.
18831   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
18832
18833   bool Is64Bit = Subtarget.is64Bit();
18834   MVT SPTy = getPointerTy(DAG.getDataLayout());
18835
18836   SDValue Result;
18837   if (!Lower) {
          // Plain path: Result = SP - Size (optionally masked to Align), then write
          // Result back to the stack pointer register.
18838     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18839     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18840     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18841                     " not tell us which reg is the stack pointer!");
18842
18843     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18844     Chain = SP.getValue(1);
18845     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18846     unsigned StackAlign = TFI.getStackAlignment();
18847     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
          // Only mask when the request is stricter than the frame's stack alignment.
18848     if (Align > StackAlign)
18849       Result = DAG.getNode(ISD::AND, dl, VT, Result,
18850                          DAG.getConstant(-(uint64_t)Align, dl, VT));
18851     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18852   } else if (SplitStack) {
18853     MachineRegisterInfo &MRI = MF.getRegInfo();
18854
18855     if (Is64Bit) {
18856       // The 64 bit implementation of segmented stacks needs to clobber both r10
18857       // and r11. This makes it impossible to use it along with nested parameters.
18858       const Function *F = MF.getFunction();
18859       for (const auto &A : F->args()) {
18860         if (A.hasNestAttr())
18861           report_fatal_error("Cannot use segmented stacks with functions that "
18862                              "have nested arguments.");
18863       }
18864     }
18865
          // Copy the size into a fresh pointer-class virtual register and hand that
          // register to SEG_ALLOCA, whose result is the allocated address.
18866     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18867     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18868     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18869     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18870                                 DAG.getRegister(Vreg, SPTy));
18871   } else {
          // WIN_ALLOCA path: the node produces only a chain and glue; the allocated
          // address is read back from the stack register afterwards.
18872     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18873     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18874     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18875
18876     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18877     unsigned SPReg = RegInfo->getStackRegister();
18878     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18879     Chain = SP.getValue(1);
18880
          // A non-zero alignment request masks the stack pointer and writes the
          // aligned value back to the stack register.
18881     if (Align) {
18882       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18883                        DAG.getConstant(-(uint64_t)Align, dl, VT));
18884       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18885     }
18886
18887     Result = SP;
18888   }
18889
18890   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18891                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18892
        // Two results: the allocated address and the output chain.
18893   SDValue Ops[2] = {Result, Chain};
18894   return DAG.getMergeValues(Ops, dl);
18895 }
18896
      /// Lower va_start. Operand 0 is the chain, operand 1 the address of the
      /// va_list to initialise, operand 2 the source value used for memory info.
      ///
      /// On 32-bit targets and for the Win64 calling convention this is a single
      /// store of the VarArgsFrameIndex address into the va_list. Otherwise the
      /// four fields of the __va_list_tag struct are stored individually (all
      /// against the incoming chain) and joined with a TokenFactor.
18897 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18898   MachineFunction &MF = DAG.getMachineFunction();
18899   auto PtrVT = getPointerTy(MF.getDataLayout());
18900   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18901
18902   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18903   SDLoc DL(Op);
18904
18905   if (!Subtarget.is64Bit() ||
18906       Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18907     // vastart just stores the address of the VarArgsFrameIndex slot into the
18908     // memory location argument.
18909     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18910     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18911                         MachinePointerInfo(SV));
18912   }
18913
18914   // __va_list_tag:
18915   //   gp_offset         (i32 at offset 0; value range 0 .. 6 * 8)
18916   //   fp_offset         (i32 at offset 4; value range 48 .. 48 + 8 * 16)
18917   //   overflow_arg_area (pointer at offset 8; parameters coming in memory).
18918   //   reg_save_area     (pointer at offset 16 on LP64, 12 otherwise)
18919   SmallVector<SDValue, 8> MemOps;
18920   SDValue FIN = Op.getOperand(1);
18921   // Store gp_offset
18922   SDValue Store = DAG.getStore(
18923       Op.getOperand(0), DL,
18924       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18925       MachinePointerInfo(SV));
18926   MemOps.push_back(Store);
18927
18928   // Store fp_offset
18929   FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18930   Store = DAG.getStore(
18931       Op.getOperand(0), DL,
18932       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18933       MachinePointerInfo(SV, 4));
18934   MemOps.push_back(Store);
18935
18936   // Store ptr to overflow_arg_area
        // (FIN advances by another 4 bytes, i.e. to offset 8 of the va_list.)
18937   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18938   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18939   Store =
18940       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18941   MemOps.push_back(Store);
18942
18943   // Store ptr to reg_save_area.
        // The previous field is a pointer, so step over 8 bytes on LP64 and 4 bytes
        // otherwise; the MachinePointerInfo offset (16 or 12) matches.
18944   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18945       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18946   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18947   Store = DAG.getStore(
18948       Op.getOperand(0), DL, RSFIN, FIN,
18949       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18950   MemOps.push_back(Store);
        // The four stores are independent of each other; merge their chains.
18951   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18952 }
18953
      /// Lower va_arg for 64-bit targets. Operands: chain, va_list pointer, source
      /// value, alignment.
      ///
      /// For the Win64 calling convention the generic expansion is used. Otherwise
      /// an X86ISD::VAARG_64 memory-intrinsic node computes the address of the next
      /// argument, and the argument is then loaded from that address. Only
      /// floating-point values of at most 16 bytes and integer values of at most
      /// 32 bytes are handled; f80 and every other type hit llvm_unreachable.
18954 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18955   assert(Subtarget.is64Bit() &&
18956          "LowerVAARG only handles 64-bit va_arg!");
18957   assert(Op.getNumOperands() == 4);
18958
18959   MachineFunction &MF = DAG.getMachineFunction();
18960   if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18961     // The Win64 ABI uses char* instead of a structure.
18962     return DAG.expandVAArg(Op.getNode());
18963
18964   SDValue Chain = Op.getOperand(0);
18965   SDValue SrcPtr = Op.getOperand(1);
18966   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18967   unsigned Align = Op.getConstantOperandVal(3);
18968   SDLoc dl(Op);
18969
18970   EVT ArgVT = Op.getNode()->getValueType(0);
18971   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
        // ArgSize is the allocation size (in bytes) of the requested type.
18972   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18973   uint8_t ArgMode;
18974
18975   // Decide which area this value should be read from.
18976   // TODO: Implement the AMD64 ABI in its entirety. This simple
18977   // selection mechanism works only for the basic types.
18978   if (ArgVT == MVT::f80) {
18979     llvm_unreachable("va_arg for f80 not yet implemented");
18980   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18981     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
18982   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18983     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
18984   } else {
18985     llvm_unreachable("Unhandled argument type in LowerVAARG");
18986   }
18987
18988   if (ArgMode == 2) {
18989     // Sanity Check: Make sure using fp_offset makes sense.
18990     assert(!Subtarget.useSoftFloat() &&
18991            !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18992            Subtarget.hasSSE1());
18993   }
18994
18995   // Insert VAARG_64 node into the DAG
18996   // VAARG_64 returns two values: Variable Argument Address, Chain
        // Its operands are: chain, va_list pointer, argument size (i32), argument
        // mode (i8) and alignment (i32). It is marked as both reading and writing
        // memory.
18997   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18998                        DAG.getConstant(ArgMode, dl, MVT::i8),
18999                        DAG.getConstant(Align, dl, MVT::i32)};
19000   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19001   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
19002                                           VTs, InstOps, MVT::i64,
19003                                           MachinePointerInfo(SV),
19004                                           /*Align=*/0,
19005                                           /*Volatile=*/false,
19006                                           /*ReadMem=*/true,
19007                                           /*WriteMem=*/true);
19008   Chain = VAARG.getValue(1);
19009
19010   // Load the next argument and return it
19011   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19012 }
19013
19014 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19015                            SelectionDAG &DAG) {
19016   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19017   // where a va_list is still an i8*.
19018   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19019   if (Subtarget.isCallingConvWin64(
19020         DAG.getMachineFunction().getFunction()->getCallingConv()))
19021     // Probably a Win64 va_copy.
19022     return DAG.expandVACopy(Op.getNode());
19023
19024   SDValue Chain = Op.getOperand(0);
19025   SDValue DstPtr = Op.getOperand(1);
19026   SDValue SrcPtr = Op.getOperand(2);
19027   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19028   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19029   SDLoc DL(Op);
19030
19031   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19032                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19033                        false, false,
19034                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19035 }
19036
19037 /// Handle vector element shifts where the shift amount is a constant.
19038 /// Takes immediate version of shift as input.
      ///
      /// \p Opc must be X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI. \p SrcOp is
      /// bitcast to \p VT when its type differs. The result is, in order of
      /// checks: \p SrcOp itself for a zero shift; a zero constant for a logical
      /// shift by at least the element width (an arithmetic shift is clamped to
      /// element width - 1 instead); a constant-folded build vector when \p SrcOp
      /// is a build vector of constants/undefs; otherwise an \p Opc node with an
      /// i8 constant shift amount.
19039 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19040                                           SDValue SrcOp, uint64_t ShiftAmt,
19041                                           SelectionDAG &DAG) {
19042   MVT ElementType = VT.getVectorElementType();
19043
19044   // Bitcast the source vector to the output type, this is mainly necessary for
19045   // vXi8/vXi64 shifts.
19046   if (VT != SrcOp.getSimpleValueType())
19047     SrcOp = DAG.getBitcast(VT, SrcOp);
19048
19049   // Fold this packed shift into its first operand if ShiftAmt is 0.
19050   if (ShiftAmt == 0)
19051     return SrcOp;
19052
19053   // Check for ShiftAmt >= element width
19054   if (ShiftAmt >= ElementType.getSizeInBits()) {
          // An arithmetic right shift is clamped to (width - 1); every other opcode
          // yields an all-zero vector.
19055     if (Opc == X86ISD::VSRAI)
19056       ShiftAmt = ElementType.getSizeInBits() - 1;
19057     else
19058       return DAG.getConstant(0, dl, VT);
19059   }
19060
19061   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19062          && "Unknown target vector shift-by-constant node");
19063
19064   // Fold this packed vector shift into a build vector if SrcOp is a
19065   // vector of Constants or UNDEFs.
19066   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19067     SmallVector<SDValue, 8> Elts;
19068     unsigned NumElts = SrcOp->getNumOperands();
19069     ConstantSDNode *ND;
19070
          // Each case below walks the operands, passing undef elements through
          // unchanged and shifting every constant element by ShiftAmt.
19071     switch(Opc) {
19072     default: llvm_unreachable("Unknown opcode!");
19073     case X86ISD::VSHLI:
            // Left shift.
19074       for (unsigned i=0; i!=NumElts; ++i) {
19075         SDValue CurrentOp = SrcOp->getOperand(i);
19076         if (CurrentOp->isUndef()) {
19077           Elts.push_back(CurrentOp);
19078           continue;
19079         }
19080         ND = cast<ConstantSDNode>(CurrentOp);
19081         const APInt &C = ND->getAPIntValue();
19082         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19083       }
19084       break;
19085     case X86ISD::VSRLI:
            // Logical right shift.
19086       for (unsigned i=0; i!=NumElts; ++i) {
19087         SDValue CurrentOp = SrcOp->getOperand(i);
19088         if (CurrentOp->isUndef()) {
19089           Elts.push_back(CurrentOp);
19090           continue;
19091         }
19092         ND = cast<ConstantSDNode>(CurrentOp);
19093         const APInt &C = ND->getAPIntValue();
19094         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19095       }
19096       break;
19097     case X86ISD::VSRAI:
            // Arithmetic right shift.
19098       for (unsigned i=0; i!=NumElts; ++i) {
19099         SDValue CurrentOp = SrcOp->getOperand(i);
19100         if (CurrentOp->isUndef()) {
19101           Elts.push_back(CurrentOp);
19102           continue;
19103         }
19104         ND = cast<ConstantSDNode>(CurrentOp);
19105         const APInt &C = ND->getAPIntValue();
19106         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19107       }
19108       break;
19109     }
19110
19111     return DAG.getBuildVector(VT, dl, Elts);
19112   }
19113
        // Not foldable: emit the immediate-form shift with an i8 shift amount.
19114   return DAG.getNode(Opc, dl, VT, SrcOp,
19115                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
19116 }
19117
19118 /// Handle vector element shifts where the shift amount may or may not be a
19119 /// constant. Takes immediate version of shift as input.
      ///
      /// \p ShAmt must be i32 or i64. A constant amount is forwarded to
      /// getTargetVShiftByConstNode. Otherwise \p Opc is switched to its
      /// non-immediate counterpart, the scalar amount is placed in the low element
      /// of a 128-bit vector, and that vector (bitcast to a 128-bit vector with
      /// \p VT's element type) becomes the shift-amount operand.
19120 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19121                                    SDValue SrcOp, SDValue ShAmt,
19122                                    const X86Subtarget &Subtarget,
19123                                    SelectionDAG &DAG) {
19124   MVT SVT = ShAmt.getSimpleValueType();
19125   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19126
19127   // Catch shift-by-constant.
19128   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19129     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19130                                       CShAmt->getZExtValue(), DAG);
19131
19132   // Change opcode to non-immediate version
19133   switch (Opc) {
19134     default: llvm_unreachable("Unknown target vector shift node");
19135     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19136     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19137     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19138   }
19139
19140   // Need to build a vector containing shift amount.
19141   // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19142   // +=================+============+=======================================+
19143   // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
19144   // +=================+============+=======================================+
19145   // | i64             | Yes, No    | Use ShAmt as lowest elt               |
19146   // | i32             | Yes        | zero-extend in-reg                    |
19147   // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
19148   // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19149   // +=================+============+=======================================+
        // NOTE: in the code below the plain-i32 "zero-extend in-reg" row is only
        // taken when ShAmt is an EXTRACT_VECTOR_ELT node; any other i32 amount
        // falls through to the build_vector form even with SSE4.1.
19150
19151   if (SVT == MVT::i64)
19152     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19153   else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19154            ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
          // Drop the scalar zero-extend and extend the i16 inside the vector.
19155     ShAmt = ShAmt.getOperand(0);
19156     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19157     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19158   } else if (Subtarget.hasSSE41() &&
19159              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19160     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19161     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19162   } else {
          // Explicit zero in element 1 clears the upper half of the low 64 bits;
          // the remaining two elements are left undefined.
19163     SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
19164                                      DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19165     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19166   }
19167
19168   // The return type has to be a 128-bit type with the same element
19169   // type as the input type.
19170   MVT EltVT = VT.getVectorElementType();
19171   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19172
19173   ShAmt = DAG.getBitcast(ShVT, ShAmt);
19174   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19175 }
19176
19177 /// \brief Return Mask with the necessary casting or extending
19178 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
      ///
      /// Handled cases, in order:
      ///  - an all-ones or zero \p Mask becomes a target constant (1 or 0) of
      ///    type \p MaskVT;
      ///  - a \p Mask narrower than \p MaskVT is any-extended to an integer of
      ///    \p MaskVT's bit width first;
      ///  - an i64 \p Mask on a 32-bit subtarget is either split into two v32i1
      ///    halves and concatenated (v64i1), or truncated and bitcast;
      ///  - otherwise \p Mask is bitcast to a vector of i1 and the low
      ///    \p MaskVT-sized subvector is extracted.
19179 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19180                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
19181                            const SDLoc &dl) {
19182
19183   if (isAllOnesConstant(Mask))
19184     return DAG.getTargetConstant(1, dl, MaskVT);
19185   if (X86::isZeroNode(Mask))
19186     return DAG.getTargetConstant(0, dl, MaskVT);
19187
19188   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19189     // Mask should be extended
19190     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19191                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19192   }
19193
19194   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19195     if (MaskVT == MVT::v64i1) {
19196       assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19197       // In case 32bit mode, bitcast i64 is illegal, extend/split it.
19198       SDValue Lo, Hi;
19199       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19200                           DAG.getConstant(0, dl, MVT::i32));
19201       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19202                           DAG.getConstant(1, dl, MVT::i32));
19203
19204       Lo = DAG.getBitcast(MVT::v32i1, Lo);
19205       Hi = DAG.getBitcast(MVT::v32i1, Hi);
19206
19207       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19208     } else {
19209       // MaskVT requires < 64 bits. Truncate mask (should succeed in any case),
19210       // and bitcast.
19211       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19212       return DAG.getBitcast(MaskVT,
19213                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19214     }
19215
19216   } else {
          // View the integer mask as one i1 lane per bit of its own width.
19217     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19218                                      Mask.getSimpleValueType().getSizeInBits());
19219     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19220     // are extracted by EXTRACT_SUBVECTOR.
19221     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19222                        DAG.getBitcast(BitcastVT, Mask),
19223                        DAG.getIntPtrConstant(0, dl));
19224   }
19225 }
19226
19227 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19228 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19229 /// necessary casting or extending for \p Mask when lowering masking intrinsics
      ///
      /// An all-ones \p Mask returns \p Op unchanged. The mask is converted with
      /// getMaskNode to a vector of i1 with one lane per element of \p Op. An undef
      /// \p PreservedSrc is replaced by a zero vector before the select is built.
19230 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19231                   SDValue PreservedSrc,
19232                   const X86Subtarget &Subtarget,
19233                   SelectionDAG &DAG) {
19234   MVT VT = Op.getSimpleValueType();
19235   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19236   unsigned OpcodeSelect = ISD::VSELECT;
19237   SDLoc dl(Op);
19238
19239   if (isAllOnesConstant(Mask))
19240     return Op;
19241
19242   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19243
19244   switch (Op.getOpcode()) {
19245   default: break;
19246   case X86ISD::PCMPEQM:
19247   case X86ISD::PCMPGTM:
19248   case X86ISD::CMPM:
19249   case X86ISD::CMPMU:
          // Mask-producing compares: combine the result with the mask using AND.
19250     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19251   case X86ISD::VFPCLASS:
19252     case X86ISD::VFPCLASSS:
          // NOTE(review): these are combined with OR, unlike the AND used for the
          // compare opcodes above -- confirm this is intended.
19253     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19254   case X86ISD::VTRUNC:
19255   case X86ISD::VTRUNCS:
19256   case X86ISD::VTRUNCUS:
19257   case X86ISD::CVTPS2PH:
19258     // We can't use ISD::VSELECT here because it is not always "Legal"
19259     // for the destination type. For example vpmovqb requires only AVX512
19260     // and a vselect that can operate on byte element type requires BWI
19261     OpcodeSelect = X86ISD::SELECT;
19262     break;
19263   }
19264   if (PreservedSrc.isUndef())
19265     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19266   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19267 }
19268
19269 /// \brief Creates an SDNode for a predicated scalar operation.
19270 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19271 /// The mask is coming as MVT::i8 and it should be transformed
19272 /// to MVT::v1i1 while lowering masking intrinsics.
19273 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19274 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19275 /// for a scalar instruction.
      ///
      /// Special cases: a constant \p Mask with bit 0 set returns \p Op unchanged;
      /// FSETCCM/FSETCCM_RND results are ANDed with the mask and VFPCLASSS results
      /// are ORed with it instead of being selected. An undef \p PreservedSrc is
      /// replaced by a zero vector.
19276 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19277                                     SDValue PreservedSrc,
19278                                     const X86Subtarget &Subtarget,
19279                                     SelectionDAG &DAG) {
19280
        // Only the lowest mask bit matters for a scalar operation.
19281   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19282     if (MaskConst->getZExtValue() & 0x1)
19283       return Op;
19284
19285   MVT VT = Op.getSimpleValueType();
19286   SDLoc dl(Op);
19287
        // Turn the scalar mask into a single-element i1 vector.
19288   SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19289   if (Op.getOpcode() == X86ISD::FSETCCM ||
19290       Op.getOpcode() == X86ISD::FSETCCM_RND)
19291     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19292   if (Op.getOpcode() == X86ISD::VFPCLASSS)
19293     return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19294
19295   if (PreservedSrc.isUndef())
19296     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19297   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19298 }
19299
      /// Return the size in bytes of the EH registration node for \p Fn, chosen by
      /// its personality function: 24 for MSVC x86 SEH and 16 for MSVC C++ EH.
      /// Reports a fatal error when \p Fn has no personality function or when the
      /// personality is neither of those two.
19300 static int getSEHRegistrationNodeSize(const Function *Fn) {
19301   if (!Fn->hasPersonalityFn())
19302     report_fatal_error(
19303         "querying registration node size for function without personality");
19304   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19305   // WinEHStatePass for the full struct definition.
19306   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19307   case EHPersonality::MSVC_X86SEH: return 24;
19308   case EHPersonality::MSVC_CXX: return 16;
19309   default: break;
19310   }
19311   report_fatal_error(
19312       "can only recover FP for 32-bit MSVC EH personality functions");
19313 }
19314
19315 /// When the MSVC runtime transfers control to us, either to an outlined
19316 /// function or when returning to a parent frame after catching an exception, we
19317 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19318 /// Here's the math for 32-bit targets:
19319 ///   RegNodeBase = EntryEBP - RegNodeSize
19320 ///   ParentFP = RegNodeBase - ParentFrameOffset
19321 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19322 /// subtracting the offset (negative on x86) takes us back to the parent FP.
      /// On 64-bit targets the result is EntryEBP + ParentFrameOffset instead.
      /// If \p Fn has no personality function, \p EntryEBP is returned unchanged.
19323 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19324                                    SDValue EntryEBP) {
19325   MachineFunction &MF = DAG.getMachineFunction();
19326   SDLoc dl;
19327
19328   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19329   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19330
19331   // It's possible that the parent function no longer has a personality function
19332   // if the exceptional code was optimized away, in which case we just return
19333   // the incoming EBP.
19334   if (!Fn->hasPersonalityFn())
19335     return EntryEBP;
19336
19337   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19338   // registration, or the .seh_setframe offset.
19339   MCSymbol *OffsetSym =
19340       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19341           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19342   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19343   SDValue ParentFrameOffset =
19344       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19345
19346   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19347   // prologue to RBP in the parent function.
19348   const X86Subtarget &Subtarget =
19349       static_cast<const X86Subtarget &>(DAG.getSubtarget());
19350   if (Subtarget.is64Bit())
19351     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19352
        // 32-bit only from here on; getSEHRegistrationNodeSize reports a fatal error
        // for personalities other than MSVC x86 SEH / MSVC C++ EH.
19353   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19354   // RegNodeBase = EntryEBP - RegNodeSize
19355   // ParentFP = RegNodeBase - ParentFrameOffset
19356   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19357                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
19358   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19359 }
19360
19361 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19362                                        SelectionDAG &DAG) {
19363   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19364   auto isRoundModeCurDirection = [](SDValue Rnd) {
19365     if (!isa<ConstantSDNode>(Rnd))
19366       return false;
19367
19368     unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19369     return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19370   };
19371
19372   SDLoc dl(Op);
19373   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19374   MVT VT = Op.getSimpleValueType();
19375   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19376   if (IntrData) {
19377     switch(IntrData->Type) {
19378     case INTR_TYPE_1OP:
19379       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19380     case INTR_TYPE_2OP:
19381       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19382         Op.getOperand(2));
19383     case INTR_TYPE_3OP:
19384       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19385         Op.getOperand(2), Op.getOperand(3));
19386     case INTR_TYPE_4OP:
19387       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19388         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19389     case INTR_TYPE_1OP_MASK_RM: {
19390       SDValue Src = Op.getOperand(1);
19391       SDValue PassThru = Op.getOperand(2);
19392       SDValue Mask = Op.getOperand(3);
19393       SDValue RoundingMode;
19394       // We always add rounding mode to the Node.
19395       // If the rounding mode is not specified, we add the
19396       // "current direction" mode.
19397       if (Op.getNumOperands() == 4)
19398         RoundingMode =
19399           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19400       else
19401         RoundingMode = Op.getOperand(4);
19402       assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19403       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19404                                               RoundingMode),
19405                                   Mask, PassThru, Subtarget, DAG);
19406     }
19407     case INTR_TYPE_1OP_MASK: {
19408       SDValue Src = Op.getOperand(1);
19409       SDValue PassThru = Op.getOperand(2);
19410       SDValue Mask = Op.getOperand(3);
19411       // We add rounding mode to the Node when
19412       //   - RM Opcode is specified and
19413       //   - RM is not "current direction".
19414       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19415       if (IntrWithRoundingModeOpcode != 0) {
19416         SDValue Rnd = Op.getOperand(4);
19417         if (!isRoundModeCurDirection(Rnd)) {
19418           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19419                                       dl, Op.getValueType(),
19420                                       Src, Rnd),
19421                                       Mask, PassThru, Subtarget, DAG);
19422         }
19423       }
19424       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19425                                   Mask, PassThru, Subtarget, DAG);
19426     }
19427     case INTR_TYPE_SCALAR_MASK: {
19428       SDValue Src1 = Op.getOperand(1);
19429       SDValue Src2 = Op.getOperand(2);
19430       SDValue passThru = Op.getOperand(3);
19431       SDValue Mask = Op.getOperand(4);
19432       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19433       if (IntrWithRoundingModeOpcode != 0) {
19434         SDValue Rnd = Op.getOperand(5);
19435         if (!isRoundModeCurDirection(Rnd))
19436           return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19437                                                   dl, VT, Src1, Src2, Rnd),
19438                                       Mask, passThru, Subtarget, DAG);
19439       }
19440       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19441                                   Mask, passThru, Subtarget, DAG);
19442     }
19443     case INTR_TYPE_SCALAR_MASK_RM: {
19444       SDValue Src1 = Op.getOperand(1);
19445       SDValue Src2 = Op.getOperand(2);
19446       SDValue Src0 = Op.getOperand(3);
19447       SDValue Mask = Op.getOperand(4);
19448       // There are 2 kinds of intrinsics in this group:
19449       // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19450       // (2) With rounding mode and sae - 7 operands.
19451       if (Op.getNumOperands() == 6) {
19452         SDValue Sae  = Op.getOperand(5);
19453         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19454                                                 Sae),
19455                                     Mask, Src0, Subtarget, DAG);
19456       }
19457       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19458       SDValue RoundingMode  = Op.getOperand(5);
19459       SDValue Sae  = Op.getOperand(6);
19460       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19461                                               RoundingMode, Sae),
19462                                   Mask, Src0, Subtarget, DAG);
19463     }
19464     case INTR_TYPE_2OP_MASK:
19465     case INTR_TYPE_2OP_IMM8_MASK: {
19466       SDValue Src1 = Op.getOperand(1);
19467       SDValue Src2 = Op.getOperand(2);
19468       SDValue PassThru = Op.getOperand(3);
19469       SDValue Mask = Op.getOperand(4);
19470
19471       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19472         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19473
19474       // We specify 2 possible opcodes for intrinsics with rounding modes.
19475       // First, we check if the intrinsic may have non-default rounding mode,
19476       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19477       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19478       if (IntrWithRoundingModeOpcode != 0) {
19479         SDValue Rnd = Op.getOperand(5);
19480         if (!isRoundModeCurDirection(Rnd)) {
19481           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19482                                       dl, Op.getValueType(),
19483                                       Src1, Src2, Rnd),
19484                                       Mask, PassThru, Subtarget, DAG);
19485         }
19486       }
19487       // TODO: Intrinsics should have fast-math-flags to propagate.
19488       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19489                                   Mask, PassThru, Subtarget, DAG);
19490     }
19491     case INTR_TYPE_2OP_MASK_RM: {
19492       SDValue Src1 = Op.getOperand(1);
19493       SDValue Src2 = Op.getOperand(2);
19494       SDValue PassThru = Op.getOperand(3);
19495       SDValue Mask = Op.getOperand(4);
19496       // We specify 2 possible modes for intrinsics, with/without rounding
19497       // modes.
19498       // First, we check if the intrinsic have rounding mode (6 operands),
19499       // if not, we set rounding mode to "current".
19500       SDValue Rnd;
19501       if (Op.getNumOperands() == 6)
19502         Rnd = Op.getOperand(5);
19503       else
19504         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19505       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19506                                               Src1, Src2, Rnd),
19507                                   Mask, PassThru, Subtarget, DAG);
19508     }
19509     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19510       SDValue Src1 = Op.getOperand(1);
19511       SDValue Src2 = Op.getOperand(2);
19512       SDValue Src3 = Op.getOperand(3);
19513       SDValue PassThru = Op.getOperand(4);
19514       SDValue Mask = Op.getOperand(5);
19515       SDValue Sae  = Op.getOperand(6);
19516
19517       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19518                                               Src2, Src3, Sae),
19519                                   Mask, PassThru, Subtarget, DAG);
19520     }
19521     case INTR_TYPE_3OP_MASK_RM: {
19522       SDValue Src1 = Op.getOperand(1);
19523       SDValue Src2 = Op.getOperand(2);
19524       SDValue Imm = Op.getOperand(3);
19525       SDValue PassThru = Op.getOperand(4);
19526       SDValue Mask = Op.getOperand(5);
19527       // We specify 2 possible modes for intrinsics, with/without rounding
19528       // modes.
19529       // First, we check if the intrinsic have rounding mode (7 operands),
19530       // if not, we set rounding mode to "current".
19531       SDValue Rnd;
19532       if (Op.getNumOperands() == 7)
19533         Rnd = Op.getOperand(6);
19534       else
19535         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19536       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19537                                               Src1, Src2, Imm, Rnd),
19538                                   Mask, PassThru, Subtarget, DAG);
19539     }
19540     case INTR_TYPE_3OP_IMM8_MASK:
19541     case INTR_TYPE_3OP_MASK: {
19542       SDValue Src1 = Op.getOperand(1);
19543       SDValue Src2 = Op.getOperand(2);
19544       SDValue Src3 = Op.getOperand(3);
19545       SDValue PassThru = Op.getOperand(4);
19546       SDValue Mask = Op.getOperand(5);
19547
19548       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19549         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19550
19551       // We specify 2 possible opcodes for intrinsics with rounding modes.
19552       // First, we check if the intrinsic may have non-default rounding mode,
19553       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19554       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19555       if (IntrWithRoundingModeOpcode != 0) {
19556         SDValue Rnd = Op.getOperand(6);
19557         if (!isRoundModeCurDirection(Rnd)) {
19558           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19559                                       dl, Op.getValueType(),
19560                                       Src1, Src2, Src3, Rnd),
19561                                       Mask, PassThru, Subtarget, DAG);
19562         }
19563       }
19564       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19565                                               Src1, Src2, Src3),
19566                                   Mask, PassThru, Subtarget, DAG);
19567     }
19568     case VPERM_2OP_MASK : {
19569       SDValue Src1 = Op.getOperand(1);
19570       SDValue Src2 = Op.getOperand(2);
19571       SDValue PassThru = Op.getOperand(3);
19572       SDValue Mask = Op.getOperand(4);
19573
19574       // Swap Src1 and Src2 in the node creation
19575       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19576                                   Mask, PassThru, Subtarget, DAG);
19577     }
19578     case VPERM_3OP_MASKZ:
19579     case VPERM_3OP_MASK:{
19580       MVT VT = Op.getSimpleValueType();
19581       // Src2 is the PassThru
19582       SDValue Src1 = Op.getOperand(1);
19583       // PassThru needs to be the same type as the destination in order
19584       // to pattern match correctly.
19585       SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19586       SDValue Src3 = Op.getOperand(3);
19587       SDValue Mask = Op.getOperand(4);
19588       SDValue PassThru = SDValue();
19589
19590       // set PassThru element
19591       if (IntrData->Type == VPERM_3OP_MASKZ)
19592         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19593       else
19594         PassThru = Src2;
19595
19596       // Swap Src1 and Src2 in the node creation
19597       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19598                                               dl, Op.getValueType(),
19599                                               Src2, Src1, Src3),
19600                                   Mask, PassThru, Subtarget, DAG);
19601     }
19602     case FMA_OP_MASK3:
19603     case FMA_OP_MASKZ:
19604     case FMA_OP_MASK: {
19605       SDValue Src1 = Op.getOperand(1);
19606       SDValue Src2 = Op.getOperand(2);
19607       SDValue Src3 = Op.getOperand(3);
19608       SDValue Mask = Op.getOperand(4);
19609       MVT VT = Op.getSimpleValueType();
19610       SDValue PassThru = SDValue();
19611
19612       // set PassThru element
19613       if (IntrData->Type == FMA_OP_MASKZ)
19614         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19615       else if (IntrData->Type == FMA_OP_MASK3)
19616         PassThru = Src3;
19617       else
19618         PassThru = Src1;
19619
19620       // We specify 2 possible opcodes for intrinsics with rounding modes.
19621       // First, we check if the intrinsic may have non-default rounding mode,
19622       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19623       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19624       if (IntrWithRoundingModeOpcode != 0) {
19625         SDValue Rnd = Op.getOperand(5);
19626         if (!isRoundModeCurDirection(Rnd))
19627           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19628                                                   dl, Op.getValueType(),
19629                                                   Src1, Src2, Src3, Rnd),
19630                                       Mask, PassThru, Subtarget, DAG);
19631       }
19632       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19633                                               dl, Op.getValueType(),
19634                                               Src1, Src2, Src3),
19635                                   Mask, PassThru, Subtarget, DAG);
19636     }
19637     case FMA_OP_SCALAR_MASK:
19638     case FMA_OP_SCALAR_MASK3:
19639     case FMA_OP_SCALAR_MASKZ: {
19640       SDValue Src1 = Op.getOperand(1);
19641       SDValue Src2 = Op.getOperand(2);
19642       SDValue Src3 = Op.getOperand(3);
19643       SDValue Mask = Op.getOperand(4);
19644       MVT VT = Op.getSimpleValueType();
19645       SDValue PassThru = SDValue();
19646
19647       // set PassThru element
19648       if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19649         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19650       else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19651         PassThru = Src3;
19652       else
19653         PassThru = Src1;
19654
19655       SDValue Rnd = Op.getOperand(5);
19656       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19657                                               Op.getValueType(), Src1, Src2,
19658                                               Src3, Rnd),
19659                                   Mask, PassThru, Subtarget, DAG);
19660     }
19661     case TERLOG_OP_MASK:
19662     case TERLOG_OP_MASKZ: {
19663       SDValue Src1 = Op.getOperand(1);
19664       SDValue Src2 = Op.getOperand(2);
19665       SDValue Src3 = Op.getOperand(3);
19666       SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19667       SDValue Mask = Op.getOperand(5);
19668       MVT VT = Op.getSimpleValueType();
19669       SDValue PassThru = Src1;
19670       // Set PassThru element.
19671       if (IntrData->Type == TERLOG_OP_MASKZ)
19672         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19673
19674       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19675                                               Src1, Src2, Src3, Src4),
19676                                   Mask, PassThru, Subtarget, DAG);
19677     }
19678     case CVTPD2PS:
19679       // ISD::FP_ROUND has a second argument that indicates if the truncation
19680       // does not change the value. Set it to 0 since it can change.
19681       return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19682                          DAG.getIntPtrConstant(0, dl));
19683     case CVTPD2PS_MASK: {
19684       SDValue Src = Op.getOperand(1);
19685       SDValue PassThru = Op.getOperand(2);
19686       SDValue Mask = Op.getOperand(3);
19687       // We add rounding mode to the Node when
19688       //   - RM Opcode is specified and
19689       //   - RM is not "current direction".
19690       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19691       if (IntrWithRoundingModeOpcode != 0) {
19692         SDValue Rnd = Op.getOperand(4);
19693         if (!isRoundModeCurDirection(Rnd)) {
19694           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19695                                       dl, Op.getValueType(),
19696                                       Src, Rnd),
19697                                       Mask, PassThru, Subtarget, DAG);
19698         }
19699       }
19700       assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19701       // ISD::FP_ROUND has a second argument that indicates if the truncation
19702       // does not change the value. Set it to 0 since it can change.
19703       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19704                                               DAG.getIntPtrConstant(0, dl)),
19705                                   Mask, PassThru, Subtarget, DAG);
19706     }
19707     case FPCLASS: {
19708       // FPclass intrinsics with mask
19709        SDValue Src1 = Op.getOperand(1);
19710        MVT VT = Src1.getSimpleValueType();
19711        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19712        SDValue Imm = Op.getOperand(2);
19713        SDValue Mask = Op.getOperand(3);
19714        MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19715                                      Mask.getSimpleValueType().getSizeInBits());
19716        SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19717        SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19718                                                  DAG.getTargetConstant(0, dl, MaskVT),
19719                                                  Subtarget, DAG);
19720        SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19721                                  DAG.getUNDEF(BitcastVT), FPclassMask,
19722                                  DAG.getIntPtrConstant(0, dl));
19723        return DAG.getBitcast(Op.getValueType(), Res);
19724     }
19725     case FPCLASSS: {
19726       SDValue Src1 = Op.getOperand(1);
19727       SDValue Imm = Op.getOperand(2);
19728       SDValue Mask = Op.getOperand(3);
19729       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
19730       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19731         DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19732       return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
19733                          DAG.getIntPtrConstant(0, dl));
19734     }
19735     case CMP_MASK:
19736     case CMP_MASK_CC: {
19737       // Comparison intrinsics with masks.
19738       // Example of transformation:
19739       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19740       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19741       // (i8 (bitcast
19742       //   (v8i1 (insert_subvector undef,
19743       //           (v2i1 (and (PCMPEQM %a, %b),
19744       //                      (extract_subvector
19745       //                         (v8i1 (bitcast %mask)), 0))), 0))))
19746       MVT VT = Op.getOperand(1).getSimpleValueType();
19747       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19748       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19749       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19750                                        Mask.getSimpleValueType().getSizeInBits());
19751       SDValue Cmp;
19752       if (IntrData->Type == CMP_MASK_CC) {
19753         SDValue CC = Op.getOperand(3);
19754         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19755         // We specify 2 possible opcodes for intrinsics with rounding modes.
19756         // First, we check if the intrinsic may have non-default rounding mode,
19757         // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19758         if (IntrData->Opc1 != 0) {
19759           SDValue Rnd = Op.getOperand(5);
19760           if (!isRoundModeCurDirection(Rnd))
19761             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19762                               Op.getOperand(2), CC, Rnd);
19763         }
19764         //default rounding mode
19765         if(!Cmp.getNode())
19766             Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19767                               Op.getOperand(2), CC);
19768
19769       } else {
19770         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19771         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19772                           Op.getOperand(2));
19773       }
19774       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19775                                              DAG.getTargetConstant(0, dl,
19776                                                                    MaskVT),
19777                                              Subtarget, DAG);
19778       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19779                                 DAG.getUNDEF(BitcastVT), CmpMask,
19780                                 DAG.getIntPtrConstant(0, dl));
19781       return DAG.getBitcast(Op.getValueType(), Res);
19782     }
19783     case CMP_MASK_SCALAR_CC: {
19784       SDValue Src1 = Op.getOperand(1);
19785       SDValue Src2 = Op.getOperand(2);
19786       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19787       SDValue Mask = Op.getOperand(4);
19788
19789       SDValue Cmp;
19790       if (IntrData->Opc1 != 0) {
19791         SDValue Rnd = Op.getOperand(5);
19792         if (!isRoundModeCurDirection(Rnd))
19793           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
19794       }
19795       //default rounding mode
19796       if(!Cmp.getNode())
19797         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
19798
19799       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19800                                              DAG.getTargetConstant(0, dl,
19801                                                                    MVT::i1),
19802                                              Subtarget, DAG);
19803       return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
19804                          DAG.getIntPtrConstant(0, dl));
19805     }
19806     case COMI: { // Comparison intrinsics
19807       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19808       SDValue LHS = Op.getOperand(1);
19809       SDValue RHS = Op.getOperand(2);
19810       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19811       SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19812       SDValue SetCC;
19813       switch (CC) {
19814       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19815         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19816         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19817         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19818         break;
19819       }
19820       case ISD::SETNE: { // (ZF = 1 or PF = 1)
19821         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19822         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19823         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19824         break;
19825       }
19826       case ISD::SETGT: // (CF = 0 and ZF = 0)
19827         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19828         break;
19829       case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19830         SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19831         break;
19832       }
19833       case ISD::SETGE: // CF = 0
19834         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19835         break;
19836       case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19837         SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19838         break;
19839       default:
19840         llvm_unreachable("Unexpected illegal condition!");
19841       }
19842       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19843     }
19844     case COMI_RM: { // Comparison intrinsics with Sae
19845       SDValue LHS = Op.getOperand(1);
19846       SDValue RHS = Op.getOperand(2);
19847       unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19848       SDValue Sae = Op.getOperand(4);
19849
19850       SDValue FCmp;
19851       if (isRoundModeCurDirection(Sae))
19852         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
19853                            DAG.getConstant(CondVal, dl, MVT::i8));
19854       else
19855         FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
19856                            DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19857       return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
19858                          DAG.getIntPtrConstant(0, dl));
19859     }
19860     case VSHIFT:
19861       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19862                                  Op.getOperand(1), Op.getOperand(2), Subtarget,
19863                                  DAG);
19864     case COMPRESS_EXPAND_IN_REG: {
19865       SDValue Mask = Op.getOperand(3);
19866       SDValue DataToCompress = Op.getOperand(1);
19867       SDValue PassThru = Op.getOperand(2);
19868       if (isAllOnesConstant(Mask)) // return data as is
19869         return Op.getOperand(1);
19870
19871       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19872                                               DataToCompress),
19873                                   Mask, PassThru, Subtarget, DAG);
19874     }
19875     case BROADCASTM: {
19876       SDValue Mask = Op.getOperand(1);
19877       MVT MaskVT = MVT::getVectorVT(MVT::i1,
19878                                     Mask.getSimpleValueType().getSizeInBits());
19879       Mask = DAG.getBitcast(MaskVT, Mask);
19880       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19881     }
19882     case KUNPCK: {
19883       MVT VT = Op.getSimpleValueType();
19884       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19885
19886       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19887       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19888       // Arguments should be swapped.
19889       SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19890                                 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19891                                 Src2, Src1);
19892       return DAG.getBitcast(VT, Res);
19893     }
19894     case MASK_BINOP: {
19895       MVT VT = Op.getSimpleValueType();
19896       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19897
19898       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19899       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19900       SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19901       return DAG.getBitcast(VT, Res);
19902     }
19903     case FIXUPIMMS:
19904     case FIXUPIMMS_MASKZ:
19905     case FIXUPIMM:
19906     case FIXUPIMM_MASKZ:{
19907       SDValue Src1 = Op.getOperand(1);
19908       SDValue Src2 = Op.getOperand(2);
19909       SDValue Src3 = Op.getOperand(3);
19910       SDValue Imm = Op.getOperand(4);
19911       SDValue Mask = Op.getOperand(5);
19912       SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19913                                          Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19914       // We specify 2 possible modes for intrinsics, with/without rounding
19915       // modes.
19916       // First, we check if the intrinsic have rounding mode (7 operands),
19917       // if not, we set rounding mode to "current".
19918       SDValue Rnd;
19919       if (Op.getNumOperands() == 7)
19920         Rnd = Op.getOperand(6);
19921       else
19922         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19923       if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19924         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19925                                                 Src1, Src2, Src3, Imm, Rnd),
19926                                     Mask, Passthru, Subtarget, DAG);
19927       else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19928         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19929                                        Src1, Src2, Src3, Imm, Rnd),
19930                                     Mask, Passthru, Subtarget, DAG);
19931     }
19932     case CONVERT_TO_MASK: {
19933       MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19934       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19935       MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19936
19937       SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19938                                     Op.getOperand(1));
19939       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19940                                 DAG.getUNDEF(BitcastVT), CvtMask,
19941                                 DAG.getIntPtrConstant(0, dl));
19942       return DAG.getBitcast(Op.getValueType(), Res);
19943     }
19944     case BRCST_SUBVEC_TO_VEC: {
19945       SDValue Src = Op.getOperand(1);
19946       SDValue Passthru = Op.getOperand(2);
19947       SDValue Mask = Op.getOperand(3);
19948       EVT resVT = Passthru.getValueType();
19949       SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19950                                        DAG.getUNDEF(resVT), Src,
19951                                        DAG.getIntPtrConstant(0, dl));
19952       SDValue immVal;
19953       if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19954         immVal = DAG.getConstant(0x44, dl, MVT::i8);
19955       else
19956         immVal = DAG.getConstant(0, dl, MVT::i8);
19957       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19958                                               subVec, subVec, immVal),
19959                                   Mask, Passthru, Subtarget, DAG);
19960     }
19961     case BRCST32x2_TO_VEC: {
19962       SDValue Src = Op.getOperand(1);
19963       SDValue PassThru = Op.getOperand(2);
19964       SDValue Mask = Op.getOperand(3);
19965
19966       assert((VT.getScalarType() == MVT::i32 ||
19967               VT.getScalarType() == MVT::f32) && "Unexpected type!");
19968       //bitcast Src to packed 64
19969       MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19970       MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19971       Src = DAG.getBitcast(BitcastVT, Src);
19972
19973       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19974                                   Mask, PassThru, Subtarget, DAG);
19975     }
19976     default:
19977       break;
19978     }
19979   }
19980
19981   switch (IntNo) {
19982   default: return SDValue();    // Don't custom lower most intrinsics.
19983
19984   case Intrinsic::x86_avx2_permd:
19985   case Intrinsic::x86_avx2_permps:
19986     // Operands intentionally swapped. Mask is last operand to intrinsic,
19987     // but second operand for node/instruction.
19988     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19989                        Op.getOperand(2), Op.getOperand(1));
19990
19991   // ptest and testp intrinsics. The intrinsic these come from are designed to
19992   // return an integer value, not just an instruction so lower it to the ptest
19993   // or testp pattern and a setcc for the result.
19994   case Intrinsic::x86_sse41_ptestz:
19995   case Intrinsic::x86_sse41_ptestc:
19996   case Intrinsic::x86_sse41_ptestnzc:
19997   case Intrinsic::x86_avx_ptestz_256:
19998   case Intrinsic::x86_avx_ptestc_256:
19999   case Intrinsic::x86_avx_ptestnzc_256:
20000   case Intrinsic::x86_avx_vtestz_ps:
20001   case Intrinsic::x86_avx_vtestc_ps:
20002   case Intrinsic::x86_avx_vtestnzc_ps:
20003   case Intrinsic::x86_avx_vtestz_pd:
20004   case Intrinsic::x86_avx_vtestc_pd:
20005   case Intrinsic::x86_avx_vtestnzc_pd:
20006   case Intrinsic::x86_avx_vtestz_ps_256:
20007   case Intrinsic::x86_avx_vtestc_ps_256:
20008   case Intrinsic::x86_avx_vtestnzc_ps_256:
20009   case Intrinsic::x86_avx_vtestz_pd_256:
20010   case Intrinsic::x86_avx_vtestc_pd_256:
20011   case Intrinsic::x86_avx_vtestnzc_pd_256: {
20012     bool IsTestPacked = false;
20013     X86::CondCode X86CC;
20014     switch (IntNo) {
20015     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20016     case Intrinsic::x86_avx_vtestz_ps:
20017     case Intrinsic::x86_avx_vtestz_pd:
20018     case Intrinsic::x86_avx_vtestz_ps_256:
20019     case Intrinsic::x86_avx_vtestz_pd_256:
20020       IsTestPacked = true;
20021       LLVM_FALLTHROUGH;
20022     case Intrinsic::x86_sse41_ptestz:
20023     case Intrinsic::x86_avx_ptestz_256:
20024       // ZF = 1
20025       X86CC = X86::COND_E;
20026       break;
20027     case Intrinsic::x86_avx_vtestc_ps:
20028     case Intrinsic::x86_avx_vtestc_pd:
20029     case Intrinsic::x86_avx_vtestc_ps_256:
20030     case Intrinsic::x86_avx_vtestc_pd_256:
20031       IsTestPacked = true;
20032       LLVM_FALLTHROUGH;
20033     case Intrinsic::x86_sse41_ptestc:
20034     case Intrinsic::x86_avx_ptestc_256:
20035       // CF = 1
20036       X86CC = X86::COND_B;
20037       break;
20038     case Intrinsic::x86_avx_vtestnzc_ps:
20039     case Intrinsic::x86_avx_vtestnzc_pd:
20040     case Intrinsic::x86_avx_vtestnzc_ps_256:
20041     case Intrinsic::x86_avx_vtestnzc_pd_256:
20042       IsTestPacked = true;
20043       LLVM_FALLTHROUGH;
20044     case Intrinsic::x86_sse41_ptestnzc:
20045     case Intrinsic::x86_avx_ptestnzc_256:
20046       // ZF and CF = 0
20047       X86CC = X86::COND_A;
20048       break;
20049     }
20050
20051     SDValue LHS = Op.getOperand(1);
20052     SDValue RHS = Op.getOperand(2);
20053     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20054     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20055     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20056     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20057   }
20058   case Intrinsic::x86_avx512_kortestz_w:
20059   case Intrinsic::x86_avx512_kortestc_w: {
20060     X86::CondCode X86CC =
20061         (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20062     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20063     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20064     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20065     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20066     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20067   }
20068
20069   case Intrinsic::x86_avx512_knot_w: {
20070     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20071     SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20072     SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20073     return DAG.getBitcast(MVT::i16, Res);
20074   }
20075
20076   case Intrinsic::x86_avx512_kandn_w: {
20077     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20078     // Invert LHS for the not.
20079     LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20080                       DAG.getConstant(1, dl, MVT::v16i1));
20081     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20082     SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20083     return DAG.getBitcast(MVT::i16, Res);
20084   }
20085
20086   case Intrinsic::x86_avx512_kxnor_w: {
20087     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20088     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20089     SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20090     // Invert result for the not.
20091     Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20092                       DAG.getConstant(1, dl, MVT::v16i1));
20093     return DAG.getBitcast(MVT::i16, Res);
20094   }
20095
20096   case Intrinsic::x86_sse42_pcmpistria128:
20097   case Intrinsic::x86_sse42_pcmpestria128:
20098   case Intrinsic::x86_sse42_pcmpistric128:
20099   case Intrinsic::x86_sse42_pcmpestric128:
20100   case Intrinsic::x86_sse42_pcmpistrio128:
20101   case Intrinsic::x86_sse42_pcmpestrio128:
20102   case Intrinsic::x86_sse42_pcmpistris128:
20103   case Intrinsic::x86_sse42_pcmpestris128:
20104   case Intrinsic::x86_sse42_pcmpistriz128:
20105   case Intrinsic::x86_sse42_pcmpestriz128: {
20106     unsigned Opcode;
20107     X86::CondCode X86CC;
20108     switch (IntNo) {
20109     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
20110     case Intrinsic::x86_sse42_pcmpistria128:
20111       Opcode = X86ISD::PCMPISTRI;
20112       X86CC = X86::COND_A;
20113       break;
20114     case Intrinsic::x86_sse42_pcmpestria128:
20115       Opcode = X86ISD::PCMPESTRI;
20116       X86CC = X86::COND_A;
20117       break;
20118     case Intrinsic::x86_sse42_pcmpistric128:
20119       Opcode = X86ISD::PCMPISTRI;
20120       X86CC = X86::COND_B;
20121       break;
20122     case Intrinsic::x86_sse42_pcmpestric128:
20123       Opcode = X86ISD::PCMPESTRI;
20124       X86CC = X86::COND_B;
20125       break;
20126     case Intrinsic::x86_sse42_pcmpistrio128:
20127       Opcode = X86ISD::PCMPISTRI;
20128       X86CC = X86::COND_O;
20129       break;
20130     case Intrinsic::x86_sse42_pcmpestrio128:
20131       Opcode = X86ISD::PCMPESTRI;
20132       X86CC = X86::COND_O;
20133       break;
20134     case Intrinsic::x86_sse42_pcmpistris128:
20135       Opcode = X86ISD::PCMPISTRI;
20136       X86CC = X86::COND_S;
20137       break;
20138     case Intrinsic::x86_sse42_pcmpestris128:
20139       Opcode = X86ISD::PCMPESTRI;
20140       X86CC = X86::COND_S;
20141       break;
20142     case Intrinsic::x86_sse42_pcmpistriz128:
20143       Opcode = X86ISD::PCMPISTRI;
20144       X86CC = X86::COND_E;
20145       break;
20146     case Intrinsic::x86_sse42_pcmpestriz128:
20147       Opcode = X86ISD::PCMPESTRI;
20148       X86CC = X86::COND_E;
20149       break;
20150     }
20151     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20152     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20153     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20154     SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20155     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20156   }
20157
20158   case Intrinsic::x86_sse42_pcmpistri128:
20159   case Intrinsic::x86_sse42_pcmpestri128: {
20160     unsigned Opcode;
20161     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20162       Opcode = X86ISD::PCMPISTRI;
20163     else
20164       Opcode = X86ISD::PCMPESTRI;
20165
20166     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20167     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20168     return DAG.getNode(Opcode, dl, VTs, NewOps);
20169   }
20170
20171   case Intrinsic::eh_sjlj_lsda: {
20172     MachineFunction &MF = DAG.getMachineFunction();
20173     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20174     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20175     auto &Context = MF.getMMI().getContext();
20176     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20177                                             Twine(MF.getFunctionNumber()));
20178     return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20179   }
20180
20181   case Intrinsic::x86_seh_lsda: {
20182     // Compute the symbol for the LSDA. We know it'll get emitted later.
20183     MachineFunction &MF = DAG.getMachineFunction();
20184     SDValue Op1 = Op.getOperand(1);
20185     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20186     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20187         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20188
20189     // Generate a simple absolute symbol reference. This intrinsic is only
20190     // supported on 32-bit Windows, which isn't PIC.
20191     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20192     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20193   }
20194
20195   case Intrinsic::x86_seh_recoverfp: {
20196     SDValue FnOp = Op.getOperand(1);
20197     SDValue IncomingFPOp = Op.getOperand(2);
20198     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20199     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20200     if (!Fn)
20201       report_fatal_error(
20202           "llvm.x86.seh.recoverfp must take a function as the first argument");
20203     return recoverFramePointer(DAG, Fn, IncomingFPOp);
20204   }
20205
20206   case Intrinsic::localaddress: {
20207     // Returns one of the stack, base, or frame pointer registers, depending on
20208     // which is used to reference local variables.
20209     MachineFunction &MF = DAG.getMachineFunction();
20210     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20211     unsigned Reg;
20212     if (RegInfo->hasBasePointer(MF))
20213       Reg = RegInfo->getBaseRegister();
20214     else // This function handles the SP or FP case.
20215       Reg = RegInfo->getPtrSizedFrameRegister(MF);
20216     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20217   }
20218   }
20219 }
20220
20221 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20222                                  SDValue Src, SDValue Mask, SDValue Base,
20223                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
20224                                  const X86Subtarget &Subtarget) {
20225   SDLoc dl(Op);
20226   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20227   // Scale must be constant.
20228   if (!C)
20229     return SDValue();
20230   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20231   EVT MaskVT = Mask.getValueType();
20232   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20233   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20234   SDValue Segment = DAG.getRegister(0, MVT::i32);
20235   // If source is undef or we know it won't be used, use a zero vector
20236   // to break register dependency.
20237   // TODO: use undef instead and let ExecutionDepsFix deal with it?
20238   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20239     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20240   SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20241   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20242   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20243   return DAG.getMergeValues(RetOps, dl);
20244 }
20245
20246 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20247                               SDValue Src, SDValue Mask, SDValue Base,
20248                               SDValue Index, SDValue ScaleOp, SDValue Chain,
20249                               const X86Subtarget &Subtarget) {
20250   SDLoc dl(Op);
20251   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20252   // Scale must be constant.
20253   if (!C)
20254     return SDValue();
20255   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20256   MVT MaskVT = MVT::getVectorVT(MVT::i1,
20257                              Index.getSimpleValueType().getVectorNumElements());
20258
20259   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20260   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20261   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20262   SDValue Segment = DAG.getRegister(0, MVT::i32);
20263   // If source is undef or we know it won't be used, use a zero vector
20264   // to break register dependency.
20265   // TODO: use undef instead and let ExecutionDepsFix deal with it?
20266   if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20267     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20268   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20269   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20270   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20271   return DAG.getMergeValues(RetOps, dl);
20272 }
20273
20274 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20275                                SDValue Src, SDValue Mask, SDValue Base,
20276                                SDValue Index, SDValue ScaleOp, SDValue Chain,
20277                                const X86Subtarget &Subtarget) {
20278   SDLoc dl(Op);
20279   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20280   // Scale must be constant.
20281   if (!C)
20282     return SDValue();
20283   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20284   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20285   SDValue Segment = DAG.getRegister(0, MVT::i32);
20286   MVT MaskVT = MVT::getVectorVT(MVT::i1,
20287                              Index.getSimpleValueType().getVectorNumElements());
20288
20289   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20290   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20291   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20292   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20293   return SDValue(Res, 1);
20294 }
20295
20296 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20297                                SDValue Mask, SDValue Base, SDValue Index,
20298                                SDValue ScaleOp, SDValue Chain,
20299                                const X86Subtarget &Subtarget) {
20300   SDLoc dl(Op);
20301   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20302   // Scale must be constant.
20303   if (!C)
20304     return SDValue();
20305   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20306   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20307   SDValue Segment = DAG.getRegister(0, MVT::i32);
20308   MVT MaskVT =
20309     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20310   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20311   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20312   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20313   return SDValue(Res, 0);
20314 }
20315
20316 /// Handles the lowering of builtin intrinsic that return the value
20317 /// of the extended control register.
20318 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20319                                        SelectionDAG &DAG,
20320                                        const X86Subtarget &Subtarget,
20321                                        SmallVectorImpl<SDValue> &Results) {
20322   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20323   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20324   SDValue LO, HI;
20325
20326   // The ECX register is used to select the index of the XCR register to
20327   // return.
20328   SDValue Chain =
20329       DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20330   SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20331   Chain = SDValue(N1, 0);
20332
20333   // Reads the content of XCR and returns it in registers EDX:EAX.
20334   if (Subtarget.is64Bit()) {
20335     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20336     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20337                             LO.getValue(2));
20338   } else {
20339     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20340     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20341                             LO.getValue(2));
20342   }
20343   Chain = HI.getValue(1);
20344
20345   if (Subtarget.is64Bit()) {
20346     // Merge the two 32-bit values into a 64-bit one..
20347     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20348                               DAG.getConstant(32, DL, MVT::i8));
20349     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20350     Results.push_back(Chain);
20351     return;
20352   }
20353
20354   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20355   SDValue Ops[] = { LO, HI };
20356   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20357   Results.push_back(Pair);
20358   Results.push_back(Chain);
20359 }
20360
20361 /// Handles the lowering of builtin intrinsics that read performance monitor
20362 /// counters (x86_rdpmc).
20363 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20364                                       SelectionDAG &DAG,
20365                                       const X86Subtarget &Subtarget,
20366                                       SmallVectorImpl<SDValue> &Results) {
20367   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20368   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20369   SDValue LO, HI;
20370
20371   // The ECX register is used to select the index of the performance counter
20372   // to read.
20373   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20374                                    N->getOperand(2));
20375   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20376
20377   // Reads the content of a 64-bit performance counter and returns it in the
20378   // registers EDX:EAX.
20379   if (Subtarget.is64Bit()) {
20380     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20381     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20382                             LO.getValue(2));
20383   } else {
20384     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20385     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20386                             LO.getValue(2));
20387   }
20388   Chain = HI.getValue(1);
20389
20390   if (Subtarget.is64Bit()) {
20391     // The EAX register is loaded with the low-order 32 bits. The EDX register
20392     // is loaded with the supported high-order bits of the counter.
20393     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20394                               DAG.getConstant(32, DL, MVT::i8));
20395     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20396     Results.push_back(Chain);
20397     return;
20398   }
20399
20400   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20401   SDValue Ops[] = { LO, HI };
20402   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20403   Results.push_back(Pair);
20404   Results.push_back(Chain);
20405 }
20406
20407 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20408 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20409 /// READCYCLECOUNTER nodes.
20410 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20411                                     SelectionDAG &DAG,
20412                                     const X86Subtarget &Subtarget,
20413                                     SmallVectorImpl<SDValue> &Results) {
20414   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20415   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20416   SDValue LO, HI;
20417
20418   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20419   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20420   // and the EAX register is loaded with the low-order 32 bits.
20421   if (Subtarget.is64Bit()) {
20422     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20423     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20424                             LO.getValue(2));
20425   } else {
20426     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20427     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20428                             LO.getValue(2));
20429   }
20430   SDValue Chain = HI.getValue(1);
20431
20432   if (Opcode == X86ISD::RDTSCP_DAG) {
20433     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20434
20435     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20436     // the ECX register. Add 'ecx' explicitly to the chain.
20437     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20438                                      HI.getValue(2));
20439     // Explicitly store the content of ECX at the location passed in input
20440     // to the 'rdtscp' intrinsic.
20441     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20442                          MachinePointerInfo());
20443   }
20444
20445   if (Subtarget.is64Bit()) {
20446     // The EDX register is loaded with the high-order 32 bits of the MSR, and
20447     // the EAX register is loaded with the low-order 32 bits.
20448     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20449                               DAG.getConstant(32, DL, MVT::i8));
20450     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20451     Results.push_back(Chain);
20452     return;
20453   }
20454
20455   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20456   SDValue Ops[] = { LO, HI };
20457   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20458   Results.push_back(Pair);
20459   Results.push_back(Chain);
20460 }
20461
20462 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20463                                      SelectionDAG &DAG) {
20464   SmallVector<SDValue, 2> Results;
20465   SDLoc DL(Op);
20466   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20467                           Results);
20468   return DAG.getMergeValues(Results, DL);
20469 }
20470
20471 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20472   MachineFunction &MF = DAG.getMachineFunction();
20473   SDValue Chain = Op.getOperand(0);
20474   SDValue RegNode = Op.getOperand(2);
20475   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20476   if (!EHInfo)
20477     report_fatal_error("EH registrations only live in functions using WinEH");
20478
20479   // Cast the operand to an alloca, and remember the frame index.
20480   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20481   if (!FINode)
20482     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20483   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20484
20485   // Return the chain operand without making any DAG nodes.
20486   return Chain;
20487 }
20488
20489 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20490   MachineFunction &MF = DAG.getMachineFunction();
20491   SDValue Chain = Op.getOperand(0);
20492   SDValue EHGuard = Op.getOperand(2);
20493   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20494   if (!EHInfo)
20495     report_fatal_error("EHGuard only live in functions using WinEH");
20496
20497   // Cast the operand to an alloca, and remember the frame index.
20498   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20499   if (!FINode)
20500     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20501   EHInfo->EHGuardFrameIndex = FINode->getIndex();
20502
20503   // Return the chain operand without making any DAG nodes.
20504   return Chain;
20505 }
20506
20507 /// Emit Truncating Store with signed or unsigned saturation.
20508 static SDValue
20509 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20510                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20511                 SelectionDAG &DAG) {
20512
20513   SDVTList VTs = DAG.getVTList(MVT::Other);
20514   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20515   SDValue Ops[] = { Chain, Val, Ptr, Undef };
20516   return SignedSat ?
20517     DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20518     DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20519 }
20520
20521 /// Emit Masked Truncating Store with signed or unsigned saturation.
20522 static SDValue
20523 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20524                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20525                       MachineMemOperand *MMO, SelectionDAG &DAG) {
20526
20527   SDVTList VTs = DAG.getVTList(MVT::Other);
20528   SDValue Ops[] = { Chain, Ptr, Mask, Val };
20529   return SignedSat ?
20530     DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20531     DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20532 }
20533
20534 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20535                                       SelectionDAG &DAG) {
20536   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20537
20538   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20539   if (!IntrData) {
20540     switch (IntNo) {
20541     case llvm::Intrinsic::x86_seh_ehregnode:
20542       return MarkEHRegistrationNode(Op, DAG);
20543     case llvm::Intrinsic::x86_seh_ehguard:
20544       return MarkEHGuard(Op, DAG);
20545     case llvm::Intrinsic::x86_flags_read_u32:
20546     case llvm::Intrinsic::x86_flags_read_u64:
20547     case llvm::Intrinsic::x86_flags_write_u32:
20548     case llvm::Intrinsic::x86_flags_write_u64: {
20549       // We need a frame pointer because this will get lowered to a PUSH/POP
20550       // sequence.
20551       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20552       MFI.setHasCopyImplyingStackAdjustment(true);
20553       // Don't do anything here, we will expand these intrinsics out later
20554       // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20555       return SDValue();
20556     }
20557     case Intrinsic::x86_lwpins32:
20558     case Intrinsic::x86_lwpins64: {
20559       SDLoc dl(Op);
20560       SDValue Chain = Op->getOperand(0);
20561       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20562       SDValue LwpIns =
20563           DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20564                       Op->getOperand(3), Op->getOperand(4));
20565       SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20566       SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20567       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20568                          LwpIns.getValue(1));
20569     }
20570     }
20571     return SDValue();
20572   }
20573
20574   SDLoc dl(Op);
20575   switch(IntrData->Type) {
20576   default: llvm_unreachable("Unknown Intrinsic Type");
20577   case RDSEED:
20578   case RDRAND: {
20579     // Emit the node with the right value type.
20580     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20581     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20582
20583     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20584     // Otherwise return the value from Rand, which is always 0, casted to i32.
20585     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20586                       DAG.getConstant(1, dl, Op->getValueType(1)),
20587                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
20588                       SDValue(Result.getNode(), 1) };
20589     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20590                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
20591                                   Ops);
20592
20593     // Return { result, isValid, chain }.
20594     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20595                        SDValue(Result.getNode(), 2));
20596   }
20597   case GATHER_AVX2: {
20598     SDValue Chain = Op.getOperand(0);
20599     SDValue Src   = Op.getOperand(2);
20600     SDValue Base  = Op.getOperand(3);
20601     SDValue Index = Op.getOperand(4);
20602     SDValue Mask  = Op.getOperand(5);
20603     SDValue Scale = Op.getOperand(6);
20604     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20605                              Scale, Chain, Subtarget);
20606   }
20607   case GATHER: {
20608   //gather(v1, mask, index, base, scale);
20609     SDValue Chain = Op.getOperand(0);
20610     SDValue Src   = Op.getOperand(2);
20611     SDValue Base  = Op.getOperand(3);
20612     SDValue Index = Op.getOperand(4);
20613     SDValue Mask  = Op.getOperand(5);
20614     SDValue Scale = Op.getOperand(6);
20615     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20616                          Chain, Subtarget);
20617   }
20618   case SCATTER: {
20619   //scatter(base, mask, index, v1, scale);
20620     SDValue Chain = Op.getOperand(0);
20621     SDValue Base  = Op.getOperand(2);
20622     SDValue Mask  = Op.getOperand(3);
20623     SDValue Index = Op.getOperand(4);
20624     SDValue Src   = Op.getOperand(5);
20625     SDValue Scale = Op.getOperand(6);
20626     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20627                           Scale, Chain, Subtarget);
20628   }
20629   case PREFETCH: {
20630     SDValue Hint = Op.getOperand(6);
20631     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20632     assert((HintVal == 2 || HintVal == 3) &&
20633            "Wrong prefetch hint in intrinsic: should be 2 or 3");
20634     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20635     SDValue Chain = Op.getOperand(0);
20636     SDValue Mask  = Op.getOperand(2);
20637     SDValue Index = Op.getOperand(3);
20638     SDValue Base  = Op.getOperand(4);
20639     SDValue Scale = Op.getOperand(5);
20640     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20641                            Subtarget);
20642   }
20643   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20644   case RDTSC: {
20645     SmallVector<SDValue, 2> Results;
20646     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20647                             Results);
20648     return DAG.getMergeValues(Results, dl);
20649   }
20650   // Read Performance Monitoring Counters.
20651   case RDPMC: {
20652     SmallVector<SDValue, 2> Results;
20653     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20654     return DAG.getMergeValues(Results, dl);
20655   }
20656   // Get Extended Control Register.
20657   case XGETBV: {
20658     SmallVector<SDValue, 2> Results;
20659     getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20660     return DAG.getMergeValues(Results, dl);
20661   }
20662   // XTEST intrinsics.
20663   case XTEST: {
20664     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20665     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20666
20667     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20668     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20669     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20670                        Ret, SDValue(InTrans.getNode(), 1));
20671   }
20672   // ADC/ADCX/SBB
20673   case ADX: {
20674     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
20675     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
20676     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20677                                 DAG.getConstant(-1, dl, MVT::i8));
20678     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20679                               Op.getOperand(4), GenCF.getValue(1));
20680     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20681                                  Op.getOperand(5), MachinePointerInfo());
20682     SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20683     SDValue Results[] = { SetCC, Store };
20684     return DAG.getMergeValues(Results, dl);
20685   }
20686   case COMPRESS_TO_MEM: {
20687     SDValue Mask = Op.getOperand(4);
20688     SDValue DataToCompress = Op.getOperand(3);
20689     SDValue Addr = Op.getOperand(2);
20690     SDValue Chain = Op.getOperand(0);
20691     MVT VT = DataToCompress.getSimpleValueType();
20692
20693     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20694     assert(MemIntr && "Expected MemIntrinsicSDNode!");
20695
20696     if (isAllOnesConstant(Mask)) // return just a store
20697       return DAG.getStore(Chain, dl, DataToCompress, Addr,
20698                           MemIntr->getMemOperand());
20699
20700     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20701     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20702
20703     return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20704                               MemIntr->getMemOperand(),
20705                               false /* truncating */, true /* compressing */);
20706   }
20707   case TRUNCATE_TO_MEM_VI8:
20708   case TRUNCATE_TO_MEM_VI16:
20709   case TRUNCATE_TO_MEM_VI32: {
20710     SDValue Mask = Op.getOperand(4);
20711     SDValue DataToTruncate = Op.getOperand(3);
20712     SDValue Addr = Op.getOperand(2);
20713     SDValue Chain = Op.getOperand(0);
20714
20715     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20716     assert(MemIntr && "Expected MemIntrinsicSDNode!");
20717
20718     EVT MemVT  = MemIntr->getMemoryVT();
20719
20720     uint16_t TruncationOp = IntrData->Opc0;
20721     switch (TruncationOp) {
20722     case X86ISD::VTRUNC: {
20723       if (isAllOnesConstant(Mask)) // return just a truncate store
20724         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20725                                  MemIntr->getMemOperand());
20726
20727       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20728       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20729
20730       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20731                                 MemIntr->getMemOperand(), true /* truncating */);
20732     }
20733     case X86ISD::VTRUNCUS:
20734     case X86ISD::VTRUNCS: {
20735       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20736       if (isAllOnesConstant(Mask))
20737         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20738                                MemIntr->getMemOperand(), DAG);
20739
20740       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20741       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20742
20743       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20744                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
20745     }
20746     default:
20747       llvm_unreachable("Unsupported truncstore intrinsic");
20748     }
20749   }
20750
20751   case EXPAND_FROM_MEM: {
20752     SDValue Mask = Op.getOperand(4);
20753     SDValue PassThru = Op.getOperand(3);
20754     SDValue Addr = Op.getOperand(2);
20755     SDValue Chain = Op.getOperand(0);
20756     MVT VT = Op.getSimpleValueType();
20757
20758     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20759     assert(MemIntr && "Expected MemIntrinsicSDNode!");
20760
20761     if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20762       return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20763     if (X86::isZeroNode(Mask))
20764       return DAG.getUNDEF(VT);
20765
20766     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20767     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20768     return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20769                              MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20770                              true /* expanding */);
20771   }
20772   }
20773 }
20774
20775 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20776                                            SelectionDAG &DAG) const {
20777   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20778   MFI.setReturnAddressIsTaken(true);
20779
20780   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20781     return SDValue();
20782
20783   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20784   SDLoc dl(Op);
20785   EVT PtrVT = getPointerTy(DAG.getDataLayout());
20786
20787   if (Depth > 0) {
20788     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20789     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20790     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20791     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20792                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20793                        MachinePointerInfo());
20794   }
20795
20796   // Just load the return address.
20797   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20798   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20799                      MachinePointerInfo());
20800 }
20801
20802 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20803                                                  SelectionDAG &DAG) const {
20804   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20805   return getReturnAddressFrameIndex(DAG);
20806 }
20807
20808 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20809   MachineFunction &MF = DAG.getMachineFunction();
20810   MachineFrameInfo &MFI = MF.getFrameInfo();
20811   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20812   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20813   EVT VT = Op.getValueType();
20814
20815   MFI.setFrameAddressIsTaken(true);
20816
20817   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20818     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
20819     // is not possible to crawl up the stack without looking at the unwind codes
20820     // simultaneously.
20821     int FrameAddrIndex = FuncInfo->getFAIndex();
20822     if (!FrameAddrIndex) {
20823       // Set up a frame object for the return address.
20824       unsigned SlotSize = RegInfo->getSlotSize();
20825       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20826           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20827       FuncInfo->setFAIndex(FrameAddrIndex);
20828     }
20829     return DAG.getFrameIndex(FrameAddrIndex, VT);
20830   }
20831
20832   unsigned FrameReg =
20833       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20834   SDLoc dl(Op);  // FIXME probably not meaningful
20835   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20836   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20837           (FrameReg == X86::EBP && VT == MVT::i32)) &&
20838          "Invalid Frame Register!");
20839   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20840   while (Depth--)
20841     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20842                             MachinePointerInfo());
20843   return FrameAddr;
20844 }
20845
20846 // FIXME? Maybe this could be a TableGen attribute on some registers and
20847 // this table could be generated automatically from RegInfo.
20848 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20849                                               SelectionDAG &DAG) const {
20850   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20851   const MachineFunction &MF = DAG.getMachineFunction();
20852
20853   unsigned Reg = StringSwitch<unsigned>(RegName)
20854                        .Case("esp", X86::ESP)
20855                        .Case("rsp", X86::RSP)
20856                        .Case("ebp", X86::EBP)
20857                        .Case("rbp", X86::RBP)
20858                        .Default(0);
20859
20860   if (Reg == X86::EBP || Reg == X86::RBP) {
20861     if (!TFI.hasFP(MF))
20862       report_fatal_error("register " + StringRef(RegName) +
20863                          " is allocatable: function has no frame pointer");
20864 #ifndef NDEBUG
20865     else {
20866       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20867       unsigned FrameReg =
20868           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20869       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20870              "Invalid Frame Register!");
20871     }
20872 #endif
20873   }
20874
20875   if (Reg)
20876     return Reg;
20877
20878   report_fatal_error("Invalid register name global variable");
20879 }
20880
20881 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20882                                                      SelectionDAG &DAG) const {
20883   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20884   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20885 }
20886
20887 unsigned X86TargetLowering::getExceptionPointerRegister(
20888     const Constant *PersonalityFn) const {
20889   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20890     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20891
20892   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20893 }
20894
20895 unsigned X86TargetLowering::getExceptionSelectorRegister(
20896     const Constant *PersonalityFn) const {
20897   // Funclet personalities don't use selectors (the runtime does the selection).
20898   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20899   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20900 }
20901
20902 bool X86TargetLowering::needsFixedCatchObjects() const {
20903   return Subtarget.isTargetWin64();
20904 }
20905
20906 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20907   SDValue Chain     = Op.getOperand(0);
20908   SDValue Offset    = Op.getOperand(1);
20909   SDValue Handler   = Op.getOperand(2);
20910   SDLoc dl      (Op);
20911
20912   EVT PtrVT = getPointerTy(DAG.getDataLayout());
20913   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20914   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20915   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20916           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20917          "Invalid Frame Register!");
20918   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20919   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20920
20921   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20922                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20923                                                        dl));
20924   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20925   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20926   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20927
20928   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20929                      DAG.getRegister(StoreAddrReg, PtrVT));
20930 }
20931
20932 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20933                                                SelectionDAG &DAG) const {
20934   SDLoc DL(Op);
20935   // If the subtarget is not 64bit, we may need the global base reg
20936   // after isel expand pseudo, i.e., after CGBR pass ran.
20937   // Therefore, ask for the GlobalBaseReg now, so that the pass
20938   // inserts the code for us in case we need it.
20939   // Otherwise, we will end up in a situation where we will
20940   // reference a virtual register that is not defined!
20941   if (!Subtarget.is64Bit()) {
20942     const X86InstrInfo *TII = Subtarget.getInstrInfo();
20943     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20944   }
20945   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20946                      DAG.getVTList(MVT::i32, MVT::Other),
20947                      Op.getOperand(0), Op.getOperand(1));
20948 }
20949
20950 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20951                                                 SelectionDAG &DAG) const {
20952   SDLoc DL(Op);
20953   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20954                      Op.getOperand(0), Op.getOperand(1));
20955 }
20956
20957 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20958                                                        SelectionDAG &DAG) const {
20959   SDLoc DL(Op);
20960   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20961                      Op.getOperand(0));
20962 }
20963
20964 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20965   return Op.getOperand(0);
20966 }
20967
20968 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20969                                                 SelectionDAG &DAG) const {
20970   SDValue Root = Op.getOperand(0);
20971   SDValue Trmp = Op.getOperand(1); // trampoline
20972   SDValue FPtr = Op.getOperand(2); // nested function
20973   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20974   SDLoc dl (Op);
20975
20976   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20977   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20978
20979   if (Subtarget.is64Bit()) {
20980     SDValue OutChains[6];
20981
20982     // Large code-model.
20983     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
20984     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20985
20986     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20987     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20988
20989     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20990
20991     // Load the pointer to the nested function into R11.
20992     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20993     SDValue Addr = Trmp;
20994     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20995                                 Addr, MachinePointerInfo(TrmpAddr));
20996
20997     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20998                        DAG.getConstant(2, dl, MVT::i64));
20999     OutChains[1] =
21000         DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21001                      /* Alignment = */ 2);
21002
21003     // Load the 'nest' parameter value into R10.
21004     // R10 is specified in X86CallingConv.td
21005     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21006     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21007                        DAG.getConstant(10, dl, MVT::i64));
21008     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21009                                 Addr, MachinePointerInfo(TrmpAddr, 10));
21010
21011     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21012                        DAG.getConstant(12, dl, MVT::i64));
21013     OutChains[3] =
21014         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21015                      /* Alignment = */ 2);
21016
21017     // Jump to the nested function.
21018     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21019     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21020                        DAG.getConstant(20, dl, MVT::i64));
21021     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21022                                 Addr, MachinePointerInfo(TrmpAddr, 20));
21023
21024     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21025     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21026                        DAG.getConstant(22, dl, MVT::i64));
21027     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21028                                 Addr, MachinePointerInfo(TrmpAddr, 22));
21029
21030     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21031   } else {
21032     const Function *Func =
21033       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21034     CallingConv::ID CC = Func->getCallingConv();
21035     unsigned NestReg;
21036
21037     switch (CC) {
21038     default:
21039       llvm_unreachable("Unsupported calling convention");
21040     case CallingConv::C:
21041     case CallingConv::X86_StdCall: {
21042       // Pass 'nest' parameter in ECX.
21043       // Must be kept in sync with X86CallingConv.td
21044       NestReg = X86::ECX;
21045
21046       // Check that ECX wasn't needed by an 'inreg' parameter.
21047       FunctionType *FTy = Func->getFunctionType();
21048       const AttributeList &Attrs = Func->getAttributes();
21049
21050       if (!Attrs.isEmpty() && !Func->isVarArg()) {
21051         unsigned InRegCount = 0;
21052         unsigned Idx = 1;
21053
21054         for (FunctionType::param_iterator I = FTy->param_begin(),
21055              E = FTy->param_end(); I != E; ++I, ++Idx)
21056           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21057             auto &DL = DAG.getDataLayout();
21058             // FIXME: should only count parameters that are lowered to integers.
21059             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21060           }
21061
21062         if (InRegCount > 2) {
21063           report_fatal_error("Nest register in use - reduce number of inreg"
21064                              " parameters!");
21065         }
21066       }
21067       break;
21068     }
21069     case CallingConv::X86_FastCall:
21070     case CallingConv::X86_ThisCall:
21071     case CallingConv::Fast:
21072       // Pass 'nest' parameter in EAX.
21073       // Must be kept in sync with X86CallingConv.td
21074       NestReg = X86::EAX;
21075       break;
21076     }
21077
21078     SDValue OutChains[4];
21079     SDValue Addr, Disp;
21080
21081     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21082                        DAG.getConstant(10, dl, MVT::i32));
21083     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21084
21085     // This is storing the opcode for MOV32ri.
21086     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21087     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21088     OutChains[0] =
21089         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21090                      Trmp, MachinePointerInfo(TrmpAddr));
21091
21092     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21093                        DAG.getConstant(1, dl, MVT::i32));
21094     OutChains[1] =
21095         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21096                      /* Alignment = */ 1);
21097
21098     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21099     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21100                        DAG.getConstant(5, dl, MVT::i32));
21101     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21102                                 Addr, MachinePointerInfo(TrmpAddr, 5),
21103                                 /* Alignment = */ 1);
21104
21105     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21106                        DAG.getConstant(6, dl, MVT::i32));
21107     OutChains[3] =
21108         DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21109                      /* Alignment = */ 1);
21110
21111     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21112   }
21113 }
21114
21115 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21116                                             SelectionDAG &DAG) const {
21117   /*
21118    The rounding mode is in bits 11:10 of FPSR, and has the following
21119    settings:
21120      00 Round to nearest
21121      01 Round to -inf
21122      10 Round to +inf
21123      11 Round to 0
21124
21125   FLT_ROUNDS, on the other hand, expects the following:
21126     -1 Undefined
21127      0 Round to 0
21128      1 Round to nearest
21129      2 Round to +inf
21130      3 Round to -inf
21131
21132   To perform the conversion, we do:
21133     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
21134   */
21135
21136   MachineFunction &MF = DAG.getMachineFunction();
21137   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21138   unsigned StackAlignment = TFI.getStackAlignment();
21139   MVT VT = Op.getSimpleValueType();
21140   SDLoc DL(Op);
21141
21142   // Save FP Control Word to stack slot
21143   int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21144   SDValue StackSlot =
21145       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21146
21147   MachineMemOperand *MMO =
21148       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21149                               MachineMemOperand::MOStore, 2, 2);
21150
21151   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21152   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21153                                           DAG.getVTList(MVT::Other),
21154                                           Ops, MVT::i16, MMO);
21155
21156   // Load FP Control Word from stack slot
21157   SDValue CWD =
21158       DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21159
21160   // Transform as necessary
21161   SDValue CWD1 =
21162     DAG.getNode(ISD::SRL, DL, MVT::i16,
21163                 DAG.getNode(ISD::AND, DL, MVT::i16,
21164                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21165                 DAG.getConstant(11, DL, MVT::i8));
21166   SDValue CWD2 =
21167     DAG.getNode(ISD::SRL, DL, MVT::i16,
21168                 DAG.getNode(ISD::AND, DL, MVT::i16,
21169                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21170                 DAG.getConstant(9, DL, MVT::i8));
21171
21172   SDValue RetVal =
21173     DAG.getNode(ISD::AND, DL, MVT::i16,
21174                 DAG.getNode(ISD::ADD, DL, MVT::i16,
21175                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21176                             DAG.getConstant(1, DL, MVT::i16)),
21177                 DAG.getConstant(3, DL, MVT::i16));
21178
21179   return DAG.getNode((VT.getSizeInBits() < 16 ?
21180                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21181 }
21182
21183 // Split an unary integer op into 2 half sized ops.
21184 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21185   MVT VT = Op.getSimpleValueType();
21186   unsigned NumElems = VT.getVectorNumElements();
21187   unsigned SizeInBits = VT.getSizeInBits();
21188
21189   // Extract the Lo/Hi vectors
21190   SDLoc dl(Op);
21191   SDValue Src = Op.getOperand(0);
21192   SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21193   SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21194
21195   MVT EltVT = VT.getVectorElementType();
21196   MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21197   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21198                      DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21199                      DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21200 }
21201
21202 // Decompose 256-bit ops into smaller 128-bit ops.
21203 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21204   assert(Op.getSimpleValueType().is256BitVector() &&
21205          Op.getSimpleValueType().isInteger() &&
21206          "Only handle AVX 256-bit vector integer operation");
21207   return LowerVectorIntUnary(Op, DAG);
21208 }
21209
21210 // Decompose 512-bit ops into smaller 256-bit ops.
21211 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21212   assert(Op.getSimpleValueType().is512BitVector() &&
21213          Op.getSimpleValueType().isInteger() &&
21214          "Only handle AVX 512-bit vector integer operation");
21215   return LowerVectorIntUnary(Op, DAG);
21216 }
21217
21218 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
21219 //
21220 // i8/i16 vector implemented using dword LZCNT vector instruction
21221 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21222 // split the vector, perform operation on it's Lo a Hi part and
21223 // concatenate the results.
21224 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21225   assert(Op.getOpcode() == ISD::CTLZ);
21226   SDLoc dl(Op);
21227   MVT VT = Op.getSimpleValueType();
21228   MVT EltVT = VT.getVectorElementType();
21229   unsigned NumElems = VT.getVectorNumElements();
21230
21231   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21232           "Unsupported element type");
21233
21234   // Split vector, it's Lo and Hi parts will be handled in next iteration.
21235   if (16 < NumElems)
21236     return LowerVectorIntUnary(Op, DAG);
21237
21238   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21239   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21240           "Unsupported value type for operation");
21241
21242   // Use native supported vector instruction vplzcntd.
21243   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21244   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21245   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21246   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21247
21248   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21249 }
21250
21251 // Lower CTLZ using a PSHUFB lookup table implementation.
21252 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21253                                        const X86Subtarget &Subtarget,
21254                                        SelectionDAG &DAG) {
21255   MVT VT = Op.getSimpleValueType();
21256   int NumElts = VT.getVectorNumElements();
21257   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21258   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21259
21260   // Per-nibble leading zero PSHUFB lookup table.
21261   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21262                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21263                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21264                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21265
21266   SmallVector<SDValue, 64> LUTVec;
21267   for (int i = 0; i < NumBytes; ++i)
21268     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21269   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21270
21271   // Begin by bitcasting the input to byte vector, then split those bytes
21272   // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
21273   // If the hi input nibble is zero then we add both results together, otherwise
21274   // we just take the hi result (by masking the lo result to zero before the
21275   // add).
21276   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21277   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21278
21279   SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21280   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21281   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21282   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21283   SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21284
21285   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21286   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21287   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21288   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21289
21290   // Merge result back from vXi8 back to VT, working on the lo/hi halves
21291   // of the current vector width in the same way we did for the nibbles.
21292   // If the upper half of the input element is zero then add the halves'
21293   // leading zero counts together, otherwise just use the upper half's.
21294   // Double the width of the result until we are at target width.
21295   while (CurrVT != VT) {
21296     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21297     int CurrNumElts = CurrVT.getVectorNumElements();
21298     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21299     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21300     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21301
21302     // Check if the upper half of the input element is zero.
21303     SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21304                                DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21305     HiZ = DAG.getBitcast(NextVT, HiZ);
21306
21307     // Move the upper/lower halves to the lower bits as we'll be extending to
21308     // NextVT. Mask the lower result to zero if HiZ is true and add the results
21309     // together.
21310     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21311     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21312     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21313     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21314     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21315     CurrVT = NextVT;
21316   }
21317
21318   return Res;
21319 }
21320
21321 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21322                                const X86Subtarget &Subtarget,
21323                                SelectionDAG &DAG) {
21324   MVT VT = Op.getSimpleValueType();
21325
21326   if (Subtarget.hasCDI())
21327     return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21328
21329   // Decompose 256-bit ops into smaller 128-bit ops.
21330   if (VT.is256BitVector() && !Subtarget.hasInt256())
21331     return Lower256IntUnary(Op, DAG);
21332
21333   // Decompose 512-bit ops into smaller 256-bit ops.
21334   if (VT.is512BitVector() && !Subtarget.hasBWI())
21335     return Lower512IntUnary(Op, DAG);
21336
21337   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21338   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21339 }
21340
21341 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21342                          SelectionDAG &DAG) {
21343   MVT VT = Op.getSimpleValueType();
21344   MVT OpVT = VT;
21345   unsigned NumBits = VT.getSizeInBits();
21346   SDLoc dl(Op);
21347   unsigned Opc = Op.getOpcode();
21348
21349   if (VT.isVector())
21350     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21351
21352   Op = Op.getOperand(0);
21353   if (VT == MVT::i8) {
21354     // Zero extend to i32 since there is not an i8 bsr.
21355     OpVT = MVT::i32;
21356     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21357   }
21358
21359   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21360   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21361   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21362
21363   if (Opc == ISD::CTLZ) {
21364     // If src is zero (i.e. bsr sets ZF), returns NumBits.
21365     SDValue Ops[] = {
21366       Op,
21367       DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21368       DAG.getConstant(X86::COND_E, dl, MVT::i8),
21369       Op.getValue(1)
21370     };
21371     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21372   }
21373
21374   // Finally xor with NumBits-1.
21375   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21376                    DAG.getConstant(NumBits - 1, dl, OpVT));
21377
21378   if (VT == MVT::i8)
21379     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21380   return Op;
21381 }
21382
21383 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21384   MVT VT = Op.getSimpleValueType();
21385   unsigned NumBits = VT.getScalarSizeInBits();
21386   SDLoc dl(Op);
21387
21388   if (VT.isVector()) {
21389     SDValue N0 = Op.getOperand(0);
21390     SDValue Zero = DAG.getConstant(0, dl, VT);
21391
21392     // lsb(x) = (x & -x)
21393     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21394                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21395
21396     // cttz_undef(x) = (width - 1) - ctlz(lsb)
21397     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21398       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21399       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21400                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21401     }
21402
21403     // cttz(x) = ctpop(lsb - 1)
21404     SDValue One = DAG.getConstant(1, dl, VT);
21405     return DAG.getNode(ISD::CTPOP, dl, VT,
21406                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21407   }
21408
21409   assert(Op.getOpcode() == ISD::CTTZ &&
21410          "Only scalar CTTZ requires custom lowering");
21411
21412   // Issue a bsf (scan bits forward) which also sets EFLAGS.
21413   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21414   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21415
21416   // If src is zero (i.e. bsf sets ZF), returns NumBits.
21417   SDValue Ops[] = {
21418     Op,
21419     DAG.getConstant(NumBits, dl, VT),
21420     DAG.getConstant(X86::COND_E, dl, MVT::i8),
21421     Op.getValue(1)
21422   };
21423   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21424 }
21425
21426 /// Break a 256-bit integer operation into two new 128-bit ones and then
21427 /// concatenate the result back.
21428 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21429   MVT VT = Op.getSimpleValueType();
21430
21431   assert(VT.is256BitVector() && VT.isInteger() &&
21432          "Unsupported value type for operation");
21433
21434   unsigned NumElems = VT.getVectorNumElements();
21435   SDLoc dl(Op);
21436
21437   // Extract the LHS vectors
21438   SDValue LHS = Op.getOperand(0);
21439   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21440   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21441
21442   // Extract the RHS vectors
21443   SDValue RHS = Op.getOperand(1);
21444   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21445   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21446
21447   MVT EltVT = VT.getVectorElementType();
21448   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21449
21450   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21451                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21452                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21453 }
21454
21455 /// Break a 512-bit integer operation into two new 256-bit ones and then
21456 /// concatenate the result back.
21457 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21458   MVT VT = Op.getSimpleValueType();
21459
21460   assert(VT.is512BitVector() && VT.isInteger() &&
21461          "Unsupported value type for operation");
21462
21463   unsigned NumElems = VT.getVectorNumElements();
21464   SDLoc dl(Op);
21465
21466   // Extract the LHS vectors
21467   SDValue LHS = Op.getOperand(0);
21468   SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21469   SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21470
21471   // Extract the RHS vectors
21472   SDValue RHS = Op.getOperand(1);
21473   SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21474   SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21475
21476   MVT EltVT = VT.getVectorElementType();
21477   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21478
21479   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21480                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21481                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21482 }
21483
21484 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21485   MVT VT = Op.getSimpleValueType();
21486   if (VT.getScalarType() == MVT::i1)
21487     return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21488                        Op.getOperand(0), Op.getOperand(1));
21489   assert(Op.getSimpleValueType().is256BitVector() &&
21490          Op.getSimpleValueType().isInteger() &&
21491          "Only handle AVX 256-bit vector integer operation");
21492   return Lower256IntArith(Op, DAG);
21493 }
21494
21495 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21496   assert(Op.getSimpleValueType().is256BitVector() &&
21497          Op.getSimpleValueType().isInteger() &&
21498          "Only handle AVX 256-bit vector integer operation");
21499   return Lower256IntUnary(Op, DAG);
21500 }
21501
21502 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21503   assert(Op.getSimpleValueType().is256BitVector() &&
21504          Op.getSimpleValueType().isInteger() &&
21505          "Only handle AVX 256-bit vector integer operation");
21506   return Lower256IntArith(Op, DAG);
21507 }
21508
21509 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21510                         SelectionDAG &DAG) {
21511   SDLoc dl(Op);
21512   MVT VT = Op.getSimpleValueType();
21513
21514   if (VT.getScalarType() == MVT::i1)
21515     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21516
21517   // Decompose 256-bit ops into smaller 128-bit ops.
21518   if (VT.is256BitVector() && !Subtarget.hasInt256())
21519     return Lower256IntArith(Op, DAG);
21520
21521   SDValue A = Op.getOperand(0);
21522   SDValue B = Op.getOperand(1);
21523
21524   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21525   // vector pairs, multiply and truncate.
21526   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21527     if (Subtarget.hasInt256()) {
21528       // For 512-bit vectors, split into 256-bit vectors to allow the
21529       // sign-extension to occur.
21530       if (VT == MVT::v64i8)
21531         return Lower512IntArith(Op, DAG);
21532
21533       // For 256-bit vectors, split into 128-bit vectors to allow the
21534       // sign-extension to occur. We don't need this on AVX512BW as we can
21535       // safely sign-extend to v32i16.
21536       if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21537         return Lower256IntArith(Op, DAG);
21538
21539       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21540       return DAG.getNode(
21541           ISD::TRUNCATE, dl, VT,
21542           DAG.getNode(ISD::MUL, dl, ExVT,
21543                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21544                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21545     }
21546
21547     assert(VT == MVT::v16i8 &&
21548            "Pre-AVX2 support only supports v16i8 multiplication");
21549     MVT ExVT = MVT::v8i16;
21550
21551     // Extract the lo parts and sign extend to i16
21552     SDValue ALo, BLo;
21553     if (Subtarget.hasSSE41()) {
21554       ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21555       BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21556     } else {
21557       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21558                               -1, 4, -1, 5, -1, 6, -1, 7};
21559       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21560       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21561       ALo = DAG.getBitcast(ExVT, ALo);
21562       BLo = DAG.getBitcast(ExVT, BLo);
21563       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21564       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21565     }
21566
21567     // Extract the hi parts and sign extend to i16
21568     SDValue AHi, BHi;
21569     if (Subtarget.hasSSE41()) {
21570       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21571                               -1, -1, -1, -1, -1, -1, -1, -1};
21572       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21573       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21574       AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21575       BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21576     } else {
21577       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
21578                               -1, 12, -1, 13, -1, 14, -1, 15};
21579       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21580       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21581       AHi = DAG.getBitcast(ExVT, AHi);
21582       BHi = DAG.getBitcast(ExVT, BHi);
21583       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21584       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21585     }
21586
21587     // Multiply, mask the lower 8bits of the lo/hi results and pack
21588     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21589     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21590     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21591     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21592     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21593   }
21594
21595   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21596   if (VT == MVT::v4i32) {
21597     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21598            "Should not custom lower when pmuldq is available!");
21599
21600     // Extract the odd parts.
21601     static const int UnpackMask[] = { 1, -1, 3, -1 };
21602     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21603     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21604
21605     // Multiply the even parts.
21606     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21607     // Now multiply odd parts.
21608     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21609
21610     Evens = DAG.getBitcast(VT, Evens);
21611     Odds = DAG.getBitcast(VT, Odds);
21612
21613     // Merge the two vectors back together with a shuffle. This expands into 2
21614     // shuffles.
21615     static const int ShufMask[] = { 0, 4, 2, 6 };
21616     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21617   }
21618
21619   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21620          "Only know how to lower V2I64/V4I64/V8I64 multiply");
21621
21622   // 32-bit vector types used for MULDQ/MULUDQ.
21623   MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21624
21625   // MULDQ returns the 64-bit result of the signed multiplication of the lower
21626   // 32-bits. We can lower with this if the sign bits stretch that far.
21627   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21628       DAG.ComputeNumSignBits(B) > 32) {
21629     return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21630                        DAG.getBitcast(MulVT, B));
21631   }
21632
21633   //  Ahi = psrlqi(a, 32);
21634   //  Bhi = psrlqi(b, 32);
21635   //
21636   //  AloBlo = pmuludq(a, b);
21637   //  AloBhi = pmuludq(a, Bhi);
21638   //  AhiBlo = pmuludq(Ahi, b);
21639   //
21640   //  Hi = psllqi(AloBhi + AhiBlo, 32);
21641   //  return AloBlo + Hi;
21642   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21643   bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21644   bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21645
21646   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21647   bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21648   bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21649
21650   // Bit cast to 32-bit vectors for MULUDQ.
21651   SDValue Alo = DAG.getBitcast(MulVT, A);
21652   SDValue Blo = DAG.getBitcast(MulVT, B);
21653
21654   SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21655
21656   // Only multiply lo/hi halves that aren't known to be zero.
21657   SDValue AloBlo = Zero;
21658   if (!ALoIsZero && !BLoIsZero)
21659     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21660
21661   SDValue AloBhi = Zero;
21662   if (!ALoIsZero && !BHiIsZero) {
21663     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21664     Bhi = DAG.getBitcast(MulVT, Bhi);
21665     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21666   }
21667
21668   SDValue AhiBlo = Zero;
21669   if (!AHiIsZero && !BLoIsZero) {
21670     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21671     Ahi = DAG.getBitcast(MulVT, Ahi);
21672     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21673   }
21674
21675   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21676   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21677
21678   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21679 }
21680
21681 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21682                          SelectionDAG &DAG) {
21683   SDLoc dl(Op);
21684   MVT VT = Op.getSimpleValueType();
21685
21686   // Decompose 256-bit ops into smaller 128-bit ops.
21687   if (VT.is256BitVector() && !Subtarget.hasInt256())
21688     return Lower256IntArith(Op, DAG);
21689
21690   // Only i8 vectors should need custom lowering after this.
21691   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21692          "Unsupported vector type");
21693
21694   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21695   // logical shift down the upper half and pack back to i8.
21696   SDValue A = Op.getOperand(0);
21697   SDValue B = Op.getOperand(1);
21698
21699   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21700   // and then ashr/lshr the upper bits down to the lower bits before multiply.
21701   unsigned Opcode = Op.getOpcode();
21702   unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21703   unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21704
21705   // AVX2 implementations - extend xmm subvectors to ymm.
21706   if (Subtarget.hasInt256()) {
21707     SDValue Lo = DAG.getIntPtrConstant(0, dl);
21708     SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21709
21710     if (VT == MVT::v32i8) {
21711       SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21712       SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21713       SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21714       SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21715       ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21716       BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21717       AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21718       BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21719       Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21720                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21721                        DAG.getConstant(8, dl, MVT::v16i16));
21722       Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21723                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21724                        DAG.getConstant(8, dl, MVT::v16i16));
21725       // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21726       // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21727       const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
21728                             16, 17, 18, 19, 20, 21, 22, 23};
21729       const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21730                             24, 25, 26, 27, 28, 29, 30, 31};
21731       return DAG.getNode(X86ISD::PACKUS, dl, VT,
21732                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21733                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21734     }
21735
21736     SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21737     SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21738     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21739     SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21740                                DAG.getConstant(8, dl, MVT::v16i16));
21741     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21742     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21743     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21744   }
21745
21746   assert(VT == MVT::v16i8 &&
21747          "Pre-AVX2 support only supports v16i8 multiplication");
21748   MVT ExVT = MVT::v8i16;
21749
21750   // Extract the lo parts and zero/sign extend to i16.
21751   SDValue ALo, BLo;
21752   if (Subtarget.hasSSE41()) {
21753     ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21754     BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21755   } else {
21756     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21757                             -1, 4, -1, 5, -1, 6, -1, 7};
21758     ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21759     BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21760     ALo = DAG.getBitcast(ExVT, ALo);
21761     BLo = DAG.getBitcast(ExVT, BLo);
21762     ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21763     BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21764   }
21765
21766   // Extract the hi parts and zero/sign extend to i16.
21767   SDValue AHi, BHi;
21768   if (Subtarget.hasSSE41()) {
21769     const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21770                             -1, -1, -1, -1, -1, -1, -1, -1};
21771     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21772     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21773     AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21774     BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21775   } else {
21776     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
21777                             -1, 12, -1, 13, -1, 14, -1, 15};
21778     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21779     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21780     AHi = DAG.getBitcast(ExVT, AHi);
21781     BHi = DAG.getBitcast(ExVT, BHi);
21782     AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21783     BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21784   }
21785
21786   // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21787   // pack back to v16i8.
21788   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21789   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21790   RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21791   RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21792   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21793 }
21794
21795 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21796   assert(Subtarget.isTargetWin64() && "Unexpected target");
21797   EVT VT = Op.getValueType();
21798   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21799          "Unexpected return type for lowering");
21800
21801   RTLIB::Libcall LC;
21802   bool isSigned;
21803   switch (Op->getOpcode()) {
21804   default: llvm_unreachable("Unexpected request for libcall!");
21805   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
21806   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
21807   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
21808   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
21809   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
21810   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21811   }
21812
21813   SDLoc dl(Op);
21814   SDValue InChain = DAG.getEntryNode();
21815
21816   TargetLowering::ArgListTy Args;
21817   TargetLowering::ArgListEntry Entry;
21818   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21819     EVT ArgVT = Op->getOperand(i).getValueType();
21820     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21821            "Unexpected argument type for lowering");
21822     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21823     Entry.Node = StackPtr;
21824     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21825                            MachinePointerInfo(), /* Alignment = */ 16);
21826     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21827     Entry.Ty = PointerType::get(ArgTy,0);
21828     Entry.IsSExt = false;
21829     Entry.IsZExt = false;
21830     Args.push_back(Entry);
21831   }
21832
21833   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21834                                          getPointerTy(DAG.getDataLayout()));
21835
21836   TargetLowering::CallLoweringInfo CLI(DAG);
21837   CLI.setDebugLoc(dl)
21838       .setChain(InChain)
21839       .setLibCallee(
21840           getLibcallCallingConv(LC),
21841           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21842           std::move(Args))
21843       .setInRegister()
21844       .setSExtResult(isSigned)
21845       .setZExtResult(!isSigned);
21846
21847   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21848   return DAG.getBitcast(VT, CallInfo.first);
21849 }
21850
21851 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21852                              SelectionDAG &DAG) {
21853   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21854   MVT VT = Op0.getSimpleValueType();
21855   SDLoc dl(Op);
21856
21857   // Decompose 256-bit ops into smaller 128-bit ops.
21858   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21859     unsigned Opcode = Op.getOpcode();
21860     unsigned NumElems = VT.getVectorNumElements();
21861     MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21862     SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21863     SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21864     SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21865     SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21866     SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21867     SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21868     SDValue Ops[] = {
21869       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21870       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21871     };
21872     return DAG.getMergeValues(Ops, dl);
21873   }
21874
21875   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21876          (VT == MVT::v8i32 && Subtarget.hasInt256()));
21877
21878   // PMULxD operations multiply each even value (starting at 0) of LHS with
21879   // the related value of RHS and produce a widen result.
21880   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21881   // => <2 x i64> <ae|cg>
21882   //
21883   // In other word, to have all the results, we need to perform two PMULxD:
21884   // 1. one with the even values.
21885   // 2. one with the odd values.
21886   // To achieve #2, with need to place the odd values at an even position.
21887   //
21888   // Place the odd value at an even position (basically, shift all values 1
21889   // step to the left):
21890   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21891   // <a|b|c|d> => <b|undef|d|undef>
21892   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21893                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21894   // <e|f|g|h> => <f|undef|h|undef>
21895   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21896                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21897
21898   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21899   // ints.
21900   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21901   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21902   unsigned Opcode =
21903       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21904   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21905   // => <2 x i64> <ae|cg>
21906   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21907   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21908   // => <2 x i64> <bf|dh>
21909   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21910
21911   // Shuffle it back into the right order.
21912   SDValue Highs, Lows;
21913   if (VT == MVT::v8i32) {
21914     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21915     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21916     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21917     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21918   } else {
21919     const int HighMask[] = {1, 5, 3, 7};
21920     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21921     const int LowMask[] = {0, 4, 2, 6};
21922     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21923   }
21924
21925   // If we have a signed multiply but no PMULDQ fix up the high parts of a
21926   // unsigned multiply.
21927   if (IsSigned && !Subtarget.hasSSE41()) {
21928     SDValue ShAmt = DAG.getConstant(
21929         31, dl,
21930         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21931     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21932                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21933     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21934                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21935
21936     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21937     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21938   }
21939
21940   // The first result of MUL_LOHI is actually the low value, followed by the
21941   // high value.
21942   SDValue Ops[] = {Lows, Highs};
21943   return DAG.getMergeValues(Ops, dl);
21944 }
21945
21946 // Return true if the required (according to Opcode) shift-imm form is natively
21947 // supported by the Subtarget
21948 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21949                                         unsigned Opcode) {
21950   if (VT.getScalarSizeInBits() < 16)
21951     return false;
21952
21953   if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21954       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21955     return true;
21956
21957   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21958                 (VT.is256BitVector() && Subtarget.hasInt256());
21959
21960   bool AShift = LShift && (Subtarget.hasAVX512() ||
21961                            (VT != MVT::v2i64 && VT != MVT::v4i64));
21962   return (Opcode == ISD::SRA) ? AShift : LShift;
21963 }
21964
21965 // The shift amount is a variable, but it is the same for all vector lanes.
21966 // These instructions are defined together with shift-immediate.
21967 static
21968 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21969                                       unsigned Opcode) {
21970   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21971 }
21972
21973 // Return true if the required (according to Opcode) variable-shift form is
21974 // natively supported by the Subtarget
21975 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21976                                     unsigned Opcode) {
21977
21978   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21979     return false;
21980
21981   // vXi16 supported only on AVX-512, BWI
21982   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21983     return false;
21984
21985   if (Subtarget.hasAVX512())
21986     return true;
21987
21988   bool LShift = VT.is128BitVector() || VT.is256BitVector();
21989   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
21990   return (Opcode == ISD::SRA) ? AShift : LShift;
21991 }
21992
21993 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21994                                          const X86Subtarget &Subtarget) {
21995   MVT VT = Op.getSimpleValueType();
21996   SDLoc dl(Op);
21997   SDValue R = Op.getOperand(0);
21998   SDValue Amt = Op.getOperand(1);
21999
22000   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22001     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22002
22003   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22004     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22005     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22006     SDValue Ex = DAG.getBitcast(ExVT, R);
22007
22008     // ashr(R, 63) === cmp_slt(R, 0)
22009     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22010       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22011              "Unsupported PCMPGT op");
22012       return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22013                          getZeroVector(VT, Subtarget, DAG, dl), R);
22014     }
22015
22016     if (ShiftAmt >= 32) {
22017       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22018       SDValue Upper =
22019           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22020       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22021                                                  ShiftAmt - 32, DAG);
22022       if (VT == MVT::v2i64)
22023         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22024       if (VT == MVT::v4i64)
22025         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22026                                   {9, 1, 11, 3, 13, 5, 15, 7});
22027     } else {
22028       // SRA upper i32, SHL whole i64 and select lower i32.
22029       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22030                                                  ShiftAmt, DAG);
22031       SDValue Lower =
22032           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22033       Lower = DAG.getBitcast(ExVT, Lower);
22034       if (VT == MVT::v2i64)
22035         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22036       if (VT == MVT::v4i64)
22037         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22038                                   {8, 1, 10, 3, 12, 5, 14, 7});
22039     }
22040     return DAG.getBitcast(VT, Ex);
22041   };
22042
22043   // Optimize shl/srl/sra with constant shift amount.
22044   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22045     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22046       uint64_t ShiftAmt = ShiftConst->getZExtValue();
22047
22048       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22049         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22050
22051       // i64 SRA needs to be performed as partial shifts.
22052       if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22053            (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22054           Op.getOpcode() == ISD::SRA)
22055         return ArithmeticShiftRight64(ShiftAmt);
22056
22057       if (VT == MVT::v16i8 ||
22058           (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22059           VT == MVT::v64i8) {
22060         unsigned NumElts = VT.getVectorNumElements();
22061         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22062
22063         // Simple i8 add case
22064         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22065           return DAG.getNode(ISD::ADD, dl, VT, R, R);
22066
22067         // ashr(R, 7)  === cmp_slt(R, 0)
22068         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22069           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22070           if (VT.is512BitVector()) {
22071             assert(VT == MVT::v64i8 && "Unexpected element type!");
22072             SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22073             return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22074           }
22075           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22076         }
22077
22078         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22079         if (VT == MVT::v16i8 && Subtarget.hasXOP())
22080           return SDValue();
22081
22082         if (Op.getOpcode() == ISD::SHL) {
22083           // Make a large shift.
22084           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22085                                                    R, ShiftAmt, DAG);
22086           SHL = DAG.getBitcast(VT, SHL);
22087           // Zero out the rightmost bits.
22088           return DAG.getNode(ISD::AND, dl, VT, SHL,
22089                              DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22090         }
22091         if (Op.getOpcode() == ISD::SRL) {
22092           // Make a large shift.
22093           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22094                                                    R, ShiftAmt, DAG);
22095           SRL = DAG.getBitcast(VT, SRL);
22096           // Zero out the leftmost bits.
22097           return DAG.getNode(ISD::AND, dl, VT, SRL,
22098                              DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22099         }
22100         if (Op.getOpcode() == ISD::SRA) {
22101           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
22102           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22103
22104           SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22105           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22106           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22107           return Res;
22108         }
22109         llvm_unreachable("Unknown shift opcode.");
22110       }
22111     }
22112   }
22113
22114   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22115   // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22116   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
22117       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22118        (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22119
22120     // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
22121     unsigned SubVectorScale = 1;
22122     if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22123       SubVectorScale =
22124           Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22125       Amt = Amt.getOperand(0);
22126     }
22127
22128     // Peek through any splat that was introduced for i64 shift vectorization.
22129     int SplatIndex = -1;
22130     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22131       if (SVN->isSplat()) {
22132         SplatIndex = SVN->getSplatIndex();
22133         Amt = Amt.getOperand(0);
22134         assert(SplatIndex < (int)VT.getVectorNumElements() &&
22135                "Splat shuffle referencing second operand");
22136       }
22137
22138     if (Amt.getOpcode() != ISD::BITCAST ||
22139         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22140       return SDValue();
22141
22142     Amt = Amt.getOperand(0);
22143     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22144                      (SubVectorScale * VT.getVectorNumElements());
22145     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22146     uint64_t ShiftAmt = 0;
22147     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22148     for (unsigned i = 0; i != Ratio; ++i) {
22149       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22150       if (!C)
22151         return SDValue();
22152       // 6 == Log2(64)
22153       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22154     }
22155
22156     // Check remaining shift amounts (if not a splat).
22157     if (SplatIndex < 0) {
22158       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22159         uint64_t ShAmt = 0;
22160         for (unsigned j = 0; j != Ratio; ++j) {
22161           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22162           if (!C)
22163             return SDValue();
22164           // 6 == Log2(64)
22165           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22166         }
22167         if (ShAmt != ShiftAmt)
22168           return SDValue();
22169       }
22170     }
22171
22172     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22173       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22174
22175     if (Op.getOpcode() == ISD::SRA)
22176       return ArithmeticShiftRight64(ShiftAmt);
22177   }
22178
22179   return SDValue();
22180 }
22181
22182 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22183                                         const X86Subtarget &Subtarget) {
22184   MVT VT = Op.getSimpleValueType();
22185   SDLoc dl(Op);
22186   SDValue R = Op.getOperand(0);
22187   SDValue Amt = Op.getOperand(1);
22188
22189   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22190     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22191
22192   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22193     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22194
22195   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22196     SDValue BaseShAmt;
22197     MVT EltVT = VT.getVectorElementType();
22198
22199     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22200       // Check if this build_vector node is doing a splat.
22201       // If so, then set BaseShAmt equal to the splat value.
22202       BaseShAmt = BV->getSplatValue();
22203       if (BaseShAmt && BaseShAmt.isUndef())
22204         BaseShAmt = SDValue();
22205     } else {
22206       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22207         Amt = Amt.getOperand(0);
22208
22209       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22210       if (SVN && SVN->isSplat()) {
22211         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22212         SDValue InVec = Amt.getOperand(0);
22213         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22214           assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22215                  "Unexpected shuffle index found!");
22216           BaseShAmt = InVec.getOperand(SplatIdx);
22217         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22218            if (ConstantSDNode *C =
22219                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22220              if (C->getZExtValue() == SplatIdx)
22221                BaseShAmt = InVec.getOperand(1);
22222            }
22223         }
22224
22225         if (!BaseShAmt)
22226           // Avoid introducing an extract element from a shuffle.
22227           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22228                                   DAG.getIntPtrConstant(SplatIdx, dl));
22229       }
22230     }
22231
22232     if (BaseShAmt.getNode()) {
22233       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22234       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22235         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22236       else if (EltVT.bitsLT(MVT::i32))
22237         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22238
22239       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22240     }
22241   }
22242
22243   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22244   if (!Subtarget.is64Bit() && VT == MVT::v2i64  &&
22245       Amt.getOpcode() == ISD::BITCAST &&
22246       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22247     Amt = Amt.getOperand(0);
22248     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22249                      VT.getVectorNumElements();
22250     std::vector<SDValue> Vals(Ratio);
22251     for (unsigned i = 0; i != Ratio; ++i)
22252       Vals[i] = Amt.getOperand(i);
22253     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22254       for (unsigned j = 0; j != Ratio; ++j)
22255         if (Vals[j] != Amt.getOperand(i + j))
22256           return SDValue();
22257     }
22258
22259     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22260       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22261   }
22262   return SDValue();
22263 }
22264
22265 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22266                           SelectionDAG &DAG) {
22267   MVT VT = Op.getSimpleValueType();
22268   SDLoc dl(Op);
22269   SDValue R = Op.getOperand(0);
22270   SDValue Amt = Op.getOperand(1);
22271   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22272
22273   assert(VT.isVector() && "Custom lowering only for vector shifts!");
22274   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22275
22276   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22277     return V;
22278
22279   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22280     return V;
22281
22282   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22283     return Op;
22284
22285   // XOP has 128-bit variable logical/arithmetic shifts.
22286   // +ve/-ve Amt = shift left/right.
22287   if (Subtarget.hasXOP() &&
22288       (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22289        VT == MVT::v8i16 || VT == MVT::v16i8)) {
22290     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22291       SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22292       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22293     }
22294     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22295       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22296     if (Op.getOpcode() == ISD::SRA)
22297       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22298   }
22299
22300   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22301   // shifts per-lane and then shuffle the partial results back together.
22302   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22303     // Splat the shift amounts so the scalar shifts above will catch it.
22304     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22305     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22306     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22307     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22308     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22309   }
22310
22311   // i64 vector arithmetic shift can be emulated with the transform:
22312   // M = lshr(SIGN_MASK, Amt)
22313   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22314   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22315       Op.getOpcode() == ISD::SRA) {
22316     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22317     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22318     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22319     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22320     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22321     return R;
22322   }
22323
22324   // If possible, lower this packed shift into a vector multiply instead of
22325   // expanding it into a sequence of scalar shifts.
22326   // Do this only if the vector shift count is a constant build_vector.
22327   if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22328       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22329        (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22330     SmallVector<SDValue, 8> Elts;
22331     MVT SVT = VT.getVectorElementType();
22332     unsigned SVTBits = SVT.getSizeInBits();
22333     APInt One(SVTBits, 1);
22334     unsigned NumElems = VT.getVectorNumElements();
22335
22336     for (unsigned i=0; i !=NumElems; ++i) {
22337       SDValue Op = Amt->getOperand(i);
22338       if (Op->isUndef()) {
22339         Elts.push_back(Op);
22340         continue;
22341       }
22342
22343       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22344       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22345       uint64_t ShAmt = C.getZExtValue();
22346       if (ShAmt >= SVTBits) {
22347         Elts.push_back(DAG.getUNDEF(SVT));
22348         continue;
22349       }
22350       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22351     }
22352     SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22353     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22354   }
22355
22356   // Lower SHL with variable shift amount.
22357   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22358     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22359
22360     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22361                      DAG.getConstant(0x3f800000U, dl, VT));
22362     Op = DAG.getBitcast(MVT::v4f32, Op);
22363     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22364     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22365   }
22366
22367   // If possible, lower this shift as a sequence of two shifts by
22368   // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22369   // Example:
22370   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22371   //
22372   // Could be rewritten as:
22373   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22374   //
22375   // The advantage is that the two shifts from the example would be
22376   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22377   // the vector shift into four scalar shifts plus four pairs of vector
22378   // insert/extract.
22379   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22380     unsigned TargetOpcode = X86ISD::MOVSS;
22381     bool CanBeSimplified;
22382     // The splat value for the first packed shift (the 'X' from the example).
22383     SDValue Amt1 = Amt->getOperand(0);
22384     // The splat value for the second packed shift (the 'Y' from the example).
22385     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22386
22387     // See if it is possible to replace this node with a sequence of
22388     // two shifts followed by a MOVSS/MOVSD/PBLEND.
22389     if (VT == MVT::v4i32) {
22390       // Check if it is legal to use a MOVSS.
22391       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22392                         Amt2 == Amt->getOperand(3);
22393       if (!CanBeSimplified) {
22394         // Otherwise, check if we can still simplify this node using a MOVSD.
22395         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22396                           Amt->getOperand(2) == Amt->getOperand(3);
22397         TargetOpcode = X86ISD::MOVSD;
22398         Amt2 = Amt->getOperand(2);
22399       }
22400     } else {
22401       // Do similar checks for the case where the machine value type
22402       // is MVT::v8i16.
22403       CanBeSimplified = Amt1 == Amt->getOperand(1);
22404       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22405         CanBeSimplified = Amt2 == Amt->getOperand(i);
22406
22407       if (!CanBeSimplified) {
22408         TargetOpcode = X86ISD::MOVSD;
22409         CanBeSimplified = true;
22410         Amt2 = Amt->getOperand(4);
22411         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22412           CanBeSimplified = Amt1 == Amt->getOperand(i);
22413         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22414           CanBeSimplified = Amt2 == Amt->getOperand(j);
22415       }
22416     }
22417
22418     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22419         isa<ConstantSDNode>(Amt2)) {
22420       // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22421       MVT CastVT = MVT::v4i32;
22422       SDValue Splat1 =
22423           DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22424       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22425       SDValue Splat2 =
22426           DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22427       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22428       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22429       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22430       if (TargetOpcode == X86ISD::MOVSD)
22431         return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22432                                                        BitCast2, {0, 1, 6, 7}));
22433       return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22434                                                      BitCast2, {0, 5, 6, 7}));
22435     }
22436   }
22437
22438   // v4i32 Non Uniform Shifts.
22439   // If the shift amount is constant we can shift each lane using the SSE2
22440   // immediate shifts, else we need to zero-extend each lane to the lower i64
22441   // and shift using the SSE2 variable shifts.
22442   // The separate results can then be blended together.
22443   if (VT == MVT::v4i32) {
22444     unsigned Opc = Op.getOpcode();
22445     SDValue Amt0, Amt1, Amt2, Amt3;
22446     if (ConstantAmt) {
22447       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22448       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22449       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22450       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22451     } else {
22452       // ISD::SHL is handled above but we include it here for completeness.
22453       switch (Opc) {
22454       default:
22455         llvm_unreachable("Unknown target vector shift node");
22456       case ISD::SHL:
22457         Opc = X86ISD::VSHL;
22458         break;
22459       case ISD::SRL:
22460         Opc = X86ISD::VSRL;
22461         break;
22462       case ISD::SRA:
22463         Opc = X86ISD::VSRA;
22464         break;
22465       }
22466       // The SSE2 shifts use the lower i64 as the same shift amount for
22467       // all lanes and the upper i64 is ignored. These shuffle masks
22468       // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
22469       SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22470       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22471       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22472       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22473       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22474     }
22475
22476     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22477     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22478     SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22479     SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22480     SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22481     SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22482     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22483   }
22484
22485   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22486   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22487   // make the existing SSE solution better.
22488   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22489       (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22490       (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22491       (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22492     MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22493     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22494     unsigned ExtOpc =
22495         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22496     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22497     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22498     return DAG.getNode(ISD::TRUNCATE, dl, VT,
22499                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22500   }
22501
22502   if (VT == MVT::v16i8 ||
22503       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22504       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22505     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22506     unsigned ShiftOpcode = Op->getOpcode();
22507
22508     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22509       if (VT.is512BitVector()) {
22510         // On AVX512BW targets we make use of the fact that VSELECT lowers
22511         // to a masked blend which selects bytes based just on the sign bit
22512         // extracted to a mask.
22513         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22514         V0 = DAG.getBitcast(VT, V0);
22515         V1 = DAG.getBitcast(VT, V1);
22516         Sel = DAG.getBitcast(VT, Sel);
22517         Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22518         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22519       } else if (Subtarget.hasSSE41()) {
22520         // On SSE41 targets we make use of the fact that VSELECT lowers
22521         // to PBLENDVB which selects bytes based just on the sign bit.
22522         V0 = DAG.getBitcast(VT, V0);
22523         V1 = DAG.getBitcast(VT, V1);
22524         Sel = DAG.getBitcast(VT, Sel);
22525         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22526       }
22527       // On pre-SSE41 targets we test for the sign bit by comparing to
22528       // zero - a negative value will set all bits of the lanes to true
22529       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22530       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22531       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22532       return DAG.getSelect(dl, SelVT, C, V0, V1);
22533     };
22534
22535     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22536     // We can safely do this using i16 shifts as we're only interested in
22537     // the 3 lower bits of each byte.
22538     Amt = DAG.getBitcast(ExtVT, Amt);
22539     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22540     Amt = DAG.getBitcast(VT, Amt);
22541
22542     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22543       // r = VSELECT(r, shift(r, 4), a);
22544       SDValue M =
22545           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22546       R = SignBitSelect(VT, Amt, M, R);
22547
22548       // a += a
22549       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22550
22551       // r = VSELECT(r, shift(r, 2), a);
22552       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22553       R = SignBitSelect(VT, Amt, M, R);
22554
22555       // a += a
22556       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22557
22558       // return VSELECT(r, shift(r, 1), a);
22559       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22560       R = SignBitSelect(VT, Amt, M, R);
22561       return R;
22562     }
22563
22564     if (Op->getOpcode() == ISD::SRA) {
22565       // For SRA we need to unpack each byte to the higher byte of a i16 vector
22566       // so we can correctly sign extend. We don't care what happens to the
22567       // lower byte.
22568       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22569       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22570       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22571       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22572       ALo = DAG.getBitcast(ExtVT, ALo);
22573       AHi = DAG.getBitcast(ExtVT, AHi);
22574       RLo = DAG.getBitcast(ExtVT, RLo);
22575       RHi = DAG.getBitcast(ExtVT, RHi);
22576
22577       // r = VSELECT(r, shift(r, 4), a);
22578       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22579                                 DAG.getConstant(4, dl, ExtVT));
22580       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22581                                 DAG.getConstant(4, dl, ExtVT));
22582       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22583       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22584
22585       // a += a
22586       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22587       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22588
22589       // r = VSELECT(r, shift(r, 2), a);
22590       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22591                         DAG.getConstant(2, dl, ExtVT));
22592       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22593                         DAG.getConstant(2, dl, ExtVT));
22594       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22595       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22596
22597       // a += a
22598       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22599       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22600
22601       // r = VSELECT(r, shift(r, 1), a);
22602       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22603                         DAG.getConstant(1, dl, ExtVT));
22604       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22605                         DAG.getConstant(1, dl, ExtVT));
22606       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22607       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22608
22609       // Logical shift the result back to the lower byte, leaving a zero upper
22610       // byte
22611       // meaning that we can safely pack with PACKUSWB.
22612       RLo =
22613           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22614       RHi =
22615           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22616       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22617     }
22618   }
22619
22620   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22621     MVT ExtVT = MVT::v8i32;
22622     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22623     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22624     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22625     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22626     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22627     ALo = DAG.getBitcast(ExtVT, ALo);
22628     AHi = DAG.getBitcast(ExtVT, AHi);
22629     RLo = DAG.getBitcast(ExtVT, RLo);
22630     RHi = DAG.getBitcast(ExtVT, RHi);
22631     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22632     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22633     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22634     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22635     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22636   }
22637
22638   if (VT == MVT::v8i16) {
22639     unsigned ShiftOpcode = Op->getOpcode();
22640
22641     // If we have a constant shift amount, the non-SSE41 path is best as
22642     // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22643     bool UseSSE41 = Subtarget.hasSSE41() &&
22644                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22645
22646     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22647       // On SSE41 targets we make use of the fact that VSELECT lowers
22648       // to PBLENDVB which selects bytes based just on the sign bit.
22649       if (UseSSE41) {
22650         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22651         V0 = DAG.getBitcast(ExtVT, V0);
22652         V1 = DAG.getBitcast(ExtVT, V1);
22653         Sel = DAG.getBitcast(ExtVT, Sel);
22654         return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22655       }
22656       // On pre-SSE41 targets we splat the sign bit - a negative value will
22657       // set all bits of the lanes to true and VSELECT uses that in
22658       // its OR(AND(V0,C),AND(V1,~C)) lowering.
22659       SDValue C =
22660           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22661       return DAG.getSelect(dl, VT, C, V0, V1);
22662     };
22663
22664     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22665     if (UseSSE41) {
22666       // On SSE41 targets we need to replicate the shift mask in both
22667       // bytes for PBLENDVB.
22668       Amt = DAG.getNode(
22669           ISD::OR, dl, VT,
22670           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22671           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22672     } else {
22673       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22674     }
22675
22676     // r = VSELECT(r, shift(r, 8), a);
22677     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22678     R = SignBitSelect(Amt, M, R);
22679
22680     // a += a
22681     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22682
22683     // r = VSELECT(r, shift(r, 4), a);
22684     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22685     R = SignBitSelect(Amt, M, R);
22686
22687     // a += a
22688     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22689
22690     // r = VSELECT(r, shift(r, 2), a);
22691     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22692     R = SignBitSelect(Amt, M, R);
22693
22694     // a += a
22695     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22696
22697     // return VSELECT(r, shift(r, 1), a);
22698     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22699     R = SignBitSelect(Amt, M, R);
22700     return R;
22701   }
22702
22703   // Decompose 256-bit shifts into smaller 128-bit shifts.
22704   if (VT.is256BitVector())
22705     return Lower256IntArith(Op, DAG);
22706
22707   return SDValue();
22708 }
22709
22710 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22711                            SelectionDAG &DAG) {
22712   MVT VT = Op.getSimpleValueType();
22713   SDLoc DL(Op);
22714   SDValue R = Op.getOperand(0);
22715   SDValue Amt = Op.getOperand(1);
22716   unsigned Opcode = Op.getOpcode();
22717   unsigned EltSizeInBits = VT.getScalarSizeInBits();
22718
22719   if (Subtarget.hasAVX512()) {
22720     // Attempt to rotate by immediate.
22721     APInt UndefElts;
22722     SmallVector<APInt, 16> EltBits;
22723     if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
22724       if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
22725             return EltBits[0] == V;
22726           })) {
22727         unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
22728         uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
22729         return DAG.getNode(Op, DL, VT, R,
22730                            DAG.getConstant(RotateAmt, DL, MVT::i8));
22731       }
22732     }
22733
22734     // Else, fall-back on VPROLV/VPRORV.
22735     return Op;
22736   }
22737
22738   assert(VT.isVector() && "Custom lowering only for vector rotates!");
22739   assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22740   assert((Opcode == ISD::ROTL) && "Only ROTL supported");
22741
22742   // XOP has 128-bit vector variable + immediate rotates.
22743   // +ve/-ve Amt = rotate left/right.
22744
22745   // Split 256-bit integers.
22746   if (VT.is256BitVector())
22747     return Lower256IntArith(Op, DAG);
22748
22749   assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22750
22751   // Attempt to rotate by immediate.
22752   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22753     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22754       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22755       assert(RotateAmt < EltSizeInBits && "Rotation out of range");
22756       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22757                          DAG.getConstant(RotateAmt, DL, MVT::i8));
22758     }
22759   }
22760
22761   // Use general rotate by variable (per-element).
22762   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22763 }
22764
22765 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22766   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22767   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22768   // looks for this combo and may remove the "setcc" instruction if the "setcc"
22769   // has only one use.
22770   SDNode *N = Op.getNode();
22771   SDValue LHS = N->getOperand(0);
22772   SDValue RHS = N->getOperand(1);
22773   unsigned BaseOp = 0;
22774   X86::CondCode Cond;
22775   SDLoc DL(Op);
22776   switch (Op.getOpcode()) {
22777   default: llvm_unreachable("Unknown ovf instruction!");
22778   case ISD::SADDO:
22779     // A subtract of one will be selected as a INC. Note that INC doesn't
22780     // set CF, so we can't do this for UADDO.
22781     if (isOneConstant(RHS)) {
22782       BaseOp = X86ISD::INC;
22783       Cond = X86::COND_O;
22784       break;
22785     }
22786     BaseOp = X86ISD::ADD;
22787     Cond = X86::COND_O;
22788     break;
22789   case ISD::UADDO:
22790     BaseOp = X86ISD::ADD;
22791     Cond = X86::COND_B;
22792     break;
22793   case ISD::SSUBO:
22794     // A subtract of one will be selected as a DEC. Note that DEC doesn't
22795     // set CF, so we can't do this for USUBO.
22796     if (isOneConstant(RHS)) {
22797       BaseOp = X86ISD::DEC;
22798       Cond = X86::COND_O;
22799       break;
22800     }
22801     BaseOp = X86ISD::SUB;
22802     Cond = X86::COND_O;
22803     break;
22804   case ISD::USUBO:
22805     BaseOp = X86ISD::SUB;
22806     Cond = X86::COND_B;
22807     break;
22808   case ISD::SMULO:
22809     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22810     Cond = X86::COND_O;
22811     break;
22812   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22813     if (N->getValueType(0) == MVT::i8) {
22814       BaseOp = X86ISD::UMUL8;
22815       Cond = X86::COND_O;
22816       break;
22817     }
22818     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22819                                  MVT::i32);
22820     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22821
22822     SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22823
22824     if (N->getValueType(1) == MVT::i1)
22825       SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22826
22827     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22828   }
22829   }
22830
22831   // Also sets EFLAGS.
22832   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22833   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22834
22835   SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22836
22837   if (N->getValueType(1) == MVT::i1)
22838     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22839
22840   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22841 }
22842
22843 /// Returns true if the operand type is exactly twice the native width, and
22844 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22845 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22846 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22847 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22848   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22849
22850   if (OpWidth == 64)
22851     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22852   else if (OpWidth == 128)
22853     return Subtarget.hasCmpxchg16b();
22854   else
22855     return false;
22856 }
22857
22858 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22859   return needsCmpXchgNb(SI->getValueOperand()->getType());
22860 }
22861
22862 // Note: this turns large loads into lock cmpxchg8b/16b.
22863 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22864 TargetLowering::AtomicExpansionKind
22865 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22866   auto PTy = cast<PointerType>(LI->getPointerOperandType());
22867   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22868                                                : AtomicExpansionKind::None;
22869 }
22870
22871 TargetLowering::AtomicExpansionKind
22872 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22873   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22874   Type *MemType = AI->getType();
22875
22876   // If the operand is too big, we must see if cmpxchg8/16b is available
22877   // and default to library calls otherwise.
22878   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22879     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22880                                    : AtomicExpansionKind::None;
22881   }
22882
22883   AtomicRMWInst::BinOp Op = AI->getOperation();
22884   switch (Op) {
22885   default:
22886     llvm_unreachable("Unknown atomic operation");
22887   case AtomicRMWInst::Xchg:
22888   case AtomicRMWInst::Add:
22889   case AtomicRMWInst::Sub:
22890     // It's better to use xadd, xsub or xchg for these in all cases.
22891     return AtomicExpansionKind::None;
22892   case AtomicRMWInst::Or:
22893   case AtomicRMWInst::And:
22894   case AtomicRMWInst::Xor:
22895     // If the atomicrmw's result isn't actually used, we can just add a "lock"
22896     // prefix to a normal instruction for these operations.
22897     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22898                             : AtomicExpansionKind::None;
22899   case AtomicRMWInst::Nand:
22900   case AtomicRMWInst::Max:
22901   case AtomicRMWInst::Min:
22902   case AtomicRMWInst::UMax:
22903   case AtomicRMWInst::UMin:
22904     // These always require a non-trivial set of data operations on x86. We must
22905     // use a cmpxchg loop.
22906     return AtomicExpansionKind::CmpXChg;
22907   }
22908 }
22909
22910 LoadInst *
22911 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22912   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22913   Type *MemType = AI->getType();
22914   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22915   // there is no benefit in turning such RMWs into loads, and it is actually
22916   // harmful as it introduces a mfence.
22917   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22918     return nullptr;
22919
22920   auto Builder = IRBuilder<>(AI);
22921   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22922   auto SSID = AI->getSyncScopeID();
22923   // We must restrict the ordering to avoid generating loads with Release or
22924   // ReleaseAcquire orderings.
22925   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22926   auto Ptr = AI->getPointerOperand();
22927
22928   // Before the load we need a fence. Here is an example lifted from
22929   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22930   // is required:
22931   // Thread 0:
22932   //   x.store(1, relaxed);
22933   //   r1 = y.fetch_add(0, release);
22934   // Thread 1:
22935   //   y.fetch_add(42, acquire);
22936   //   r2 = x.load(relaxed);
22937   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22938   // lowered to just a load without a fence. A mfence flushes the store buffer,
22939   // making the optimization clearly correct.
22940   // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22941   // otherwise, we might be able to be more aggressive on relaxed idempotent
22942   // rmw. In practice, they do not look useful, so we don't try to be
22943   // especially clever.
22944   if (SSID == SyncScope::SingleThread)
22945     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22946     // the IR level, so we must wrap it in an intrinsic.
22947     return nullptr;
22948
22949   if (!Subtarget.hasMFence())
22950     // FIXME: it might make sense to use a locked operation here but on a
22951     // different cache-line to prevent cache-line bouncing. In practice it
22952     // is probably a small win, and x86 processors without mfence are rare
22953     // enough that we do not bother.
22954     return nullptr;
22955
22956   Function *MFence =
22957       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22958   Builder.CreateCall(MFence, {});
22959
22960   // Finally we can emit the atomic load.
22961   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22962           AI->getType()->getPrimitiveSizeInBits());
22963   Loaded->setAtomic(Order, SSID);
22964   AI->replaceAllUsesWith(Loaded);
22965   AI->eraseFromParent();
22966   return Loaded;
22967 }
22968
22969 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22970                                  SelectionDAG &DAG) {
22971   SDLoc dl(Op);
22972   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22973     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22974   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
22975     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22976
22977   // The only fence that needs an instruction is a sequentially-consistent
22978   // cross-thread fence.
22979   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22980       FenceSSID == SyncScope::System) {
22981     if (Subtarget.hasMFence())
22982       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22983
22984     SDValue Chain = Op.getOperand(0);
22985     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22986     SDValue Ops[] = {
22987       DAG.getRegister(X86::ESP, MVT::i32),     // Base
22988       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
22989       DAG.getRegister(0, MVT::i32),            // Index
22990       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
22991       DAG.getRegister(0, MVT::i32),            // Segment.
22992       Zero,
22993       Chain
22994     };
22995     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22996     return SDValue(Res, 0);
22997   }
22998
22999   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23000   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23001 }
23002
23003 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23004                              SelectionDAG &DAG) {
23005   MVT T = Op.getSimpleValueType();
23006   SDLoc DL(Op);
23007   unsigned Reg = 0;
23008   unsigned size = 0;
23009   switch(T.SimpleTy) {
23010   default: llvm_unreachable("Invalid value type!");
23011   case MVT::i8:  Reg = X86::AL;  size = 1; break;
23012   case MVT::i16: Reg = X86::AX;  size = 2; break;
23013   case MVT::i32: Reg = X86::EAX; size = 4; break;
23014   case MVT::i64:
23015     assert(Subtarget.is64Bit() && "Node not type legal!");
23016     Reg = X86::RAX; size = 8;
23017     break;
23018   }
23019   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23020                                   Op.getOperand(2), SDValue());
23021   SDValue Ops[] = { cpIn.getValue(0),
23022                     Op.getOperand(1),
23023                     Op.getOperand(3),
23024                     DAG.getTargetConstant(size, DL, MVT::i8),
23025                     cpIn.getValue(1) };
23026   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23027   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23028   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23029                                            Ops, T, MMO);
23030
23031   SDValue cpOut =
23032     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23033   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23034                                       MVT::i32, cpOut.getValue(2));
23035   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23036
23037   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23038   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23039   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23040   return SDValue();
23041 }
23042
23043 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23044                             SelectionDAG &DAG) {
23045   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23046   MVT DstVT = Op.getSimpleValueType();
23047
23048   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23049       SrcVT == MVT::i64) {
23050     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23051     if (DstVT != MVT::f64)
23052       // This conversion needs to be expanded.
23053       return SDValue();
23054
23055     SDValue Op0 = Op->getOperand(0);
23056     SmallVector<SDValue, 16> Elts;
23057     SDLoc dl(Op);
23058     unsigned NumElts;
23059     MVT SVT;
23060     if (SrcVT.isVector()) {
23061       NumElts = SrcVT.getVectorNumElements();
23062       SVT = SrcVT.getVectorElementType();
23063
23064       // Widen the vector in input in the case of MVT::v2i32.
23065       // Example: from MVT::v2i32 to MVT::v4i32.
23066       for (unsigned i = 0, e = NumElts; i != e; ++i)
23067         Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23068                                    DAG.getIntPtrConstant(i, dl)));
23069     } else {
23070       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23071              "Unexpected source type in LowerBITCAST");
23072       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23073                                  DAG.getIntPtrConstant(0, dl)));
23074       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23075                                  DAG.getIntPtrConstant(1, dl)));
23076       NumElts = 2;
23077       SVT = MVT::i32;
23078     }
23079     // Explicitly mark the extra elements as Undef.
23080     Elts.append(NumElts, DAG.getUNDEF(SVT));
23081
23082     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23083     SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23084     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23085     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23086                        DAG.getIntPtrConstant(0, dl));
23087   }
23088
23089   assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23090          Subtarget.hasMMX() && "Unexpected custom BITCAST");
23091   assert((DstVT == MVT::i64 ||
23092           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23093          "Unexpected custom BITCAST");
23094   // i64 <=> MMX conversions are Legal.
23095   if (SrcVT==MVT::i64 && DstVT.isVector())
23096     return Op;
23097   if (DstVT==MVT::i64 && SrcVT.isVector())
23098     return Op;
23099   // MMX <=> MMX conversions are Legal.
23100   if (SrcVT.isVector() && DstVT.isVector())
23101     return Op;
23102   // All other conversions need to be expanded.
23103   return SDValue();
23104 }
23105
23106 /// Compute the horizontal sum of bytes in V for the elements of VT.
23107 ///
23108 /// Requires V to be a byte vector and VT to be an integer vector type with
23109 /// wider elements than V's type. The width of the elements of VT determines
23110 /// how many bytes of V are summed horizontally to produce each element of the
23111 /// result.
23112 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23113                                       const X86Subtarget &Subtarget,
23114                                       SelectionDAG &DAG) {
23115   SDLoc DL(V);
23116   MVT ByteVecVT = V.getSimpleValueType();
23117   MVT EltVT = VT.getVectorElementType();
23118   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23119          "Expected value to have byte element type.");
23120   assert(EltVT != MVT::i8 &&
23121          "Horizontal byte sum only makes sense for wider elements!");
23122   unsigned VecSize = VT.getSizeInBits();
23123   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23124
23125   // PSADBW instruction horizontally add all bytes and leave the result in i64
23126   // chunks, thus directly computes the pop count for v2i64 and v4i64.
23127   if (EltVT == MVT::i64) {
23128     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23129     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23130     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23131     return DAG.getBitcast(VT, V);
23132   }
23133
23134   if (EltVT == MVT::i32) {
23135     // We unpack the low half and high half into i32s interleaved with zeros so
23136     // that we can use PSADBW to horizontally sum them. The most useful part of
23137     // this is that it lines up the results of two PSADBW instructions to be
23138     // two v2i64 vectors which concatenated are the 4 population counts. We can
23139     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23140     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23141     SDValue V32 = DAG.getBitcast(VT, V);
23142     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23143     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23144
23145     // Do the horizontal sums into two v2i64s.
23146     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23147     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23148     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23149                       DAG.getBitcast(ByteVecVT, Low), Zeros);
23150     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23151                        DAG.getBitcast(ByteVecVT, High), Zeros);
23152
23153     // Merge them together.
23154     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23155     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23156                     DAG.getBitcast(ShortVecVT, Low),
23157                     DAG.getBitcast(ShortVecVT, High));
23158
23159     return DAG.getBitcast(VT, V);
23160   }
23161
23162   // The only element type left is i16.
23163   assert(EltVT == MVT::i16 && "Unknown how to handle type");
23164
23165   // To obtain pop count for each i16 element starting from the pop count for
23166   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23167   // right by 8. It is important to shift as i16s as i8 vector shift isn't
23168   // directly supported.
23169   SDValue ShifterV = DAG.getConstant(8, DL, VT);
23170   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23171   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23172                   DAG.getBitcast(ByteVecVT, V));
23173   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23174 }
23175
23176 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23177                                         const X86Subtarget &Subtarget,
23178                                         SelectionDAG &DAG) {
23179   MVT VT = Op.getSimpleValueType();
23180   MVT EltVT = VT.getVectorElementType();
23181   unsigned VecSize = VT.getSizeInBits();
23182
23183   // Implement a lookup table in register by using an algorithm based on:
23184   // http://wm.ite.pl/articles/sse-popcount.html
23185   //
23186   // The general idea is that every lower byte nibble in the input vector is an
23187   // index into a in-register pre-computed pop count table. We then split up the
23188   // input vector in two new ones: (1) a vector with only the shifted-right
23189   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23190   // masked out higher ones) for each byte. PSHUFB is used separately with both
23191   // to index the in-register table. Next, both are added and the result is a
23192   // i8 vector where each element contains the pop count for input byte.
23193   //
23194   // To obtain the pop count for elements != i8, we follow up with the same
23195   // approach and use additional tricks as described below.
23196   //
23197   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23198                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23199                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23200                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23201
23202   int NumByteElts = VecSize / 8;
23203   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23204   SDValue In = DAG.getBitcast(ByteVecVT, Op);
23205   SmallVector<SDValue, 64> LUTVec;
23206   for (int i = 0; i < NumByteElts; ++i)
23207     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23208   SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23209   SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23210
23211   // High nibbles
23212   SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23213   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23214
23215   // Low nibbles
23216   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23217
23218   // The input vector is used as the shuffle mask that index elements into the
23219   // LUT. After counting low and high nibbles, add the vector to obtain the
23220   // final pop count per i8 element.
23221   SDValue HighPopCnt =
23222       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23223   SDValue LowPopCnt =
23224       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23225   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23226
23227   if (EltVT == MVT::i8)
23228     return PopCnt;
23229
23230   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23231 }
23232
23233 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23234                                        const X86Subtarget &Subtarget,
23235                                        SelectionDAG &DAG) {
23236   MVT VT = Op.getSimpleValueType();
23237   assert(VT.is128BitVector() &&
23238          "Only 128-bit vector bitmath lowering supported.");
23239
23240   int VecSize = VT.getSizeInBits();
23241   MVT EltVT = VT.getVectorElementType();
23242   int Len = EltVT.getSizeInBits();
23243
23244   // This is the vectorized version of the "best" algorithm from
23245   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23246   // with a minor tweak to use a series of adds + shifts instead of vector
23247   // multiplications. Implemented for all integer vector types. We only use
23248   // this when we don't have SSSE3 which allows a LUT-based lowering that is
23249   // much faster, even faster than using native popcnt instructions.
23250
23251   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23252     MVT VT = V.getSimpleValueType();
23253     SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23254     return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23255   };
23256   auto GetMask = [&](SDValue V, APInt Mask) {
23257     MVT VT = V.getSimpleValueType();
23258     SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23259     return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23260   };
23261
23262   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23263   // x86, so set the SRL type to have elements at least i16 wide. This is
23264   // correct because all of our SRLs are followed immediately by a mask anyways
23265   // that handles any bits that sneak into the high bits of the byte elements.
23266   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23267
23268   SDValue V = Op;
23269
23270   // v = v - ((v >> 1) & 0x55555555...)
23271   SDValue Srl =
23272       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23273   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23274   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23275
23276   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23277   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23278   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23279   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23280   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23281
23282   // v = (v + (v >> 4)) & 0x0F0F0F0F...
23283   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23284   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23285   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23286
23287   // At this point, V contains the byte-wise population count, and we are
23288   // merely doing a horizontal sum if necessary to get the wider element
23289   // counts.
23290   if (EltVT == MVT::i8)
23291     return V;
23292
23293   return LowerHorizontalByteSum(
23294       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23295       DAG);
23296 }
23297
23298 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23299 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23300 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23301                                 SelectionDAG &DAG) {
23302   MVT VT = Op.getSimpleValueType();
23303   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23304          "Unknown CTPOP type to handle");
23305   SDLoc DL(Op.getNode());
23306   SDValue Op0 = Op.getOperand(0);
23307
23308   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23309   if (Subtarget.hasVPOPCNTDQ()) {
23310     if (VT == MVT::v8i16) {
23311       Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23312       Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
23313       return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23314     }
23315     if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23316       Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23317       Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
23318       return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23319     }
23320   }
23321
23322   if (!Subtarget.hasSSSE3()) {
23323     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23324     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23325     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23326   }
23327
23328   // Decompose 256-bit ops into smaller 128-bit ops.
23329   if (VT.is256BitVector() && !Subtarget.hasInt256())
23330     return Lower256IntUnary(Op, DAG);
23331
23332   // Decompose 512-bit ops into smaller 256-bit ops.
23333   if (VT.is512BitVector() && !Subtarget.hasBWI())
23334     return Lower512IntUnary(Op, DAG);
23335
23336   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23337 }
23338
23339 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23340                           SelectionDAG &DAG) {
23341   assert(Op.getSimpleValueType().isVector() &&
23342          "We only do custom lowering for vector population count.");
23343   return LowerVectorCTPOP(Op, Subtarget, DAG);
23344 }
23345
23346 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23347   MVT VT = Op.getSimpleValueType();
23348   SDValue In = Op.getOperand(0);
23349   SDLoc DL(Op);
23350
23351   // For scalars, its still beneficial to transfer to/from the SIMD unit to
23352   // perform the BITREVERSE.
23353   if (!VT.isVector()) {
23354     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23355     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23356     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23357     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23358                        DAG.getIntPtrConstant(0, DL));
23359   }
23360
23361   int NumElts = VT.getVectorNumElements();
23362   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23363
23364   // Decompose 256-bit ops into smaller 128-bit ops.
23365   if (VT.is256BitVector())
23366     return Lower256IntUnary(Op, DAG);
23367
23368   assert(VT.is128BitVector() &&
23369          "Only 128-bit vector bitreverse lowering supported.");
23370
23371   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23372   // perform the BSWAP in the shuffle.
23373   // Its best to shuffle using the second operand as this will implicitly allow
23374   // memory folding for multiple vectors.
23375   SmallVector<SDValue, 16> MaskElts;
23376   for (int i = 0; i != NumElts; ++i) {
23377     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23378       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23379       int PermuteByte = SourceByte | (2 << 5);
23380       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23381     }
23382   }
23383
23384   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23385   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23386   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23387                     Res, Mask);
23388   return DAG.getBitcast(VT, Res);
23389 }
23390
23391 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23392                                SelectionDAG &DAG) {
23393   if (Subtarget.hasXOP())
23394     return LowerBITREVERSE_XOP(Op, DAG);
23395
23396   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23397
23398   MVT VT = Op.getSimpleValueType();
23399   SDValue In = Op.getOperand(0);
23400   SDLoc DL(Op);
23401
23402   unsigned NumElts = VT.getVectorNumElements();
23403   assert(VT.getScalarType() == MVT::i8 &&
23404          "Only byte vector BITREVERSE supported");
23405
23406   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23407   if (VT.is256BitVector() && !Subtarget.hasInt256())
23408     return Lower256IntUnary(Op, DAG);
23409
23410   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23411   // two nibbles and a PSHUFB lookup to find the bitreverse of each
23412   // 0-15 value (moved to the other nibble).
23413   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23414   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23415   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23416
23417   const int LoLUT[16] = {
23418       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23419       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23420       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23421       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23422   const int HiLUT[16] = {
23423       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23424       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23425       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23426       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
23427
23428   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23429   for (unsigned i = 0; i < NumElts; ++i) {
23430     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23431     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23432   }
23433
23434   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23435   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23436   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23437   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23438   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23439 }
23440
23441 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23442   unsigned NewOpc = 0;
23443   switch (N->getOpcode()) {
23444   case ISD::ATOMIC_LOAD_ADD:
23445     NewOpc = X86ISD::LADD;
23446     break;
23447   case ISD::ATOMIC_LOAD_SUB:
23448     NewOpc = X86ISD::LSUB;
23449     break;
23450   case ISD::ATOMIC_LOAD_OR:
23451     NewOpc = X86ISD::LOR;
23452     break;
23453   case ISD::ATOMIC_LOAD_XOR:
23454     NewOpc = X86ISD::LXOR;
23455     break;
23456   case ISD::ATOMIC_LOAD_AND:
23457     NewOpc = X86ISD::LAND;
23458     break;
23459   default:
23460     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23461   }
23462
23463   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23464   return DAG.getMemIntrinsicNode(
23465       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23466       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23467       /*MemVT=*/N->getSimpleValueType(0), MMO);
23468 }
23469
23470 /// Lower atomic_load_ops into LOCK-prefixed operations.
23471 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23472                                 const X86Subtarget &Subtarget) {
23473   SDValue Chain = N->getOperand(0);
23474   SDValue LHS = N->getOperand(1);
23475   SDValue RHS = N->getOperand(2);
23476   unsigned Opc = N->getOpcode();
23477   MVT VT = N->getSimpleValueType(0);
23478   SDLoc DL(N);
23479
23480   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23481   // can only be lowered when the result is unused.  They should have already
23482   // been transformed into a cmpxchg loop in AtomicExpand.
23483   if (N->hasAnyUseOfValue(0)) {
23484     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23485     // select LXADD if LOCK_SUB can't be selected.
23486     if (Opc == ISD::ATOMIC_LOAD_SUB) {
23487       AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23488       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23489       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23490                            RHS, AN->getMemOperand());
23491     }
23492     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23493            "Used AtomicRMW ops other than Add should have been expanded!");
23494     return N;
23495   }
23496
23497   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23498   // RAUW the chain, but don't worry about the result, as it's unused.
23499   assert(!N->hasAnyUseOfValue(0));
23500   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23501   return SDValue();
23502 }
23503
23504 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23505   SDNode *Node = Op.getNode();
23506   SDLoc dl(Node);
23507   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23508
23509   // Convert seq_cst store -> xchg
23510   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23511   // FIXME: On 32-bit, store -> fist or movq would be more efficient
23512   //        (The only way to get a 16-byte store is cmpxchg16b)
23513   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23514   if (cast<AtomicSDNode>(Node)->getOrdering() ==
23515           AtomicOrdering::SequentiallyConsistent ||
23516       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23517     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23518                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
23519                                  Node->getOperand(0),
23520                                  Node->getOperand(1), Node->getOperand(2),
23521                                  cast<AtomicSDNode>(Node)->getMemOperand());
23522     return Swap.getValue(1);
23523   }
23524   // Other atomic stores have a simple pattern.
23525   return Op;
23526 }
23527
23528 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23529   SDNode *N = Op.getNode();
23530   MVT VT = N->getSimpleValueType(0);
23531
23532   // Let legalize expand this if it isn't a legal type yet.
23533   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23534     return SDValue();
23535
23536   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23537   SDLoc DL(N);
23538
23539   // Set the carry flag.
23540   SDValue Carry = Op.getOperand(2);
23541   EVT CarryVT = Carry.getValueType();
23542   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23543   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23544                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
23545
23546   unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23547   SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23548                             Op.getOperand(1), Carry.getValue(1));
23549
23550   SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23551   if (N->getValueType(1) == MVT::i1)
23552     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23553
23554   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23555 }
23556
23557 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23558                             SelectionDAG &DAG) {
23559   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23560
23561   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23562   // which returns the values as { float, float } (in XMM0) or
23563   // { double, double } (which is returned in XMM0, XMM1).
23564   SDLoc dl(Op);
23565   SDValue Arg = Op.getOperand(0);
23566   EVT ArgVT = Arg.getValueType();
23567   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23568
23569   TargetLowering::ArgListTy Args;
23570   TargetLowering::ArgListEntry Entry;
23571
23572   Entry.Node = Arg;
23573   Entry.Ty = ArgTy;
23574   Entry.IsSExt = false;
23575   Entry.IsZExt = false;
23576   Args.push_back(Entry);
23577
23578   bool isF64 = ArgVT == MVT::f64;
23579   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23580   // the small struct {f32, f32} is returned in (eax, edx). For f64,
23581   // the results are returned via SRet in memory.
23582   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
23583   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23584   SDValue Callee =
23585       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23586
23587   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23588                       : (Type *)VectorType::get(ArgTy, 4);
23589
23590   TargetLowering::CallLoweringInfo CLI(DAG);
23591   CLI.setDebugLoc(dl)
23592       .setChain(DAG.getEntryNode())
23593       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23594
23595   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23596
23597   if (isF64)
23598     // Returned in xmm0 and xmm1.
23599     return CallResult.first;
23600
23601   // Returned in bits 0:31 and 32:64 xmm0.
23602   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23603                                CallResult.first, DAG.getIntPtrConstant(0, dl));
23604   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23605                                CallResult.first, DAG.getIntPtrConstant(1, dl));
23606   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23607   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23608 }
23609
23610 /// Widen a vector input to a vector of NVT.  The
23611 /// input vector must have the same element type as NVT.
23612 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23613                             bool FillWithZeroes = false) {
23614   // Check if InOp already has the right width.
23615   MVT InVT = InOp.getSimpleValueType();
23616   if (InVT == NVT)
23617     return InOp;
23618
23619   if (InOp.isUndef())
23620     return DAG.getUNDEF(NVT);
23621
23622   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23623          "input and widen element type must match");
23624
23625   unsigned InNumElts = InVT.getVectorNumElements();
23626   unsigned WidenNumElts = NVT.getVectorNumElements();
23627   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23628          "Unexpected request for vector widening");
23629
23630   SDLoc dl(InOp);
23631   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23632       InOp.getNumOperands() == 2) {
23633     SDValue N1 = InOp.getOperand(1);
23634     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23635         N1.isUndef()) {
23636       InOp = InOp.getOperand(0);
23637       InVT = InOp.getSimpleValueType();
23638       InNumElts = InVT.getVectorNumElements();
23639     }
23640   }
23641   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23642       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23643     SmallVector<SDValue, 16> Ops;
23644     for (unsigned i = 0; i < InNumElts; ++i)
23645       Ops.push_back(InOp.getOperand(i));
23646
23647     EVT EltVT = InOp.getOperand(0).getValueType();
23648
23649     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23650       DAG.getUNDEF(EltVT);
23651     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23652       Ops.push_back(FillVal);
23653     return DAG.getBuildVector(NVT, dl, Ops);
23654   }
23655   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23656     DAG.getUNDEF(NVT);
23657   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23658                      InOp, DAG.getIntPtrConstant(0, dl));
23659 }
23660
23661 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23662                              SelectionDAG &DAG) {
23663   assert(Subtarget.hasAVX512() &&
23664          "MGATHER/MSCATTER are supported on AVX-512 arch only");
23665
23666   // X86 scatter kills mask register, so its type should be added to
23667   // the list of return values.
23668   // If the "scatter" has 2 return values, it is already handled.
23669   if (Op.getNode()->getNumValues() == 2)
23670     return Op;
23671
23672   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23673   SDValue Src = N->getValue();
23674   MVT VT = Src.getSimpleValueType();
23675   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23676   SDLoc dl(Op);
23677
23678   SDValue NewScatter;
23679   SDValue Index = N->getIndex();
23680   SDValue Mask = N->getMask();
23681   SDValue Chain = N->getChain();
23682   SDValue BasePtr = N->getBasePtr();
23683   MVT MemVT = N->getMemoryVT().getSimpleVT();
23684   MVT IndexVT = Index.getSimpleValueType();
23685   MVT MaskVT = Mask.getSimpleValueType();
23686
23687   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23688     // The v2i32 value was promoted to v2i64.
23689     // Now we "redo" the type legalizer's work and widen the original
23690     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23691     // with a shuffle.
23692     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23693            "Unexpected memory type");
23694     int ShuffleMask[] = {0, 2, -1, -1};
23695     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23696                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23697     // Now we have 4 elements instead of 2.
23698     // Expand the index.
23699     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23700     Index = ExtendToType(Index, NewIndexVT, DAG);
23701
23702     // Expand the mask with zeroes
23703     // Mask may be <2 x i64> or <2 x i1> at this moment
23704     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23705            "Unexpected mask type");
23706     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23707     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23708     VT = MVT::v4i32;
23709   }
23710
23711   unsigned NumElts = VT.getVectorNumElements();
23712   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23713       !Index.getSimpleValueType().is512BitVector()) {
23714     // AVX512F supports only 512-bit vectors. Or data or index should
23715     // be 512 bit wide. If now the both index and data are 256-bit, but
23716     // the vector contains 8 elements, we just sign-extend the index
23717     if (IndexVT == MVT::v8i32)
23718       // Just extend index
23719       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23720     else {
23721       // The minimal number of elts in scatter is 8
23722       NumElts = 8;
23723       // Index
23724       MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23725       // Use original index here, do not modify the index twice
23726       Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23727       if (IndexVT.getScalarType() == MVT::i32)
23728         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23729
23730       // Mask
23731       // At this point we have promoted mask operand
23732       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23733       MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23734       // Use the original mask here, do not modify the mask twice
23735       Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23736
23737       // The value that should be stored
23738       MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23739       Src = ExtendToType(Src, NewVT, DAG);
23740     }
23741   }
23742   // If the mask is "wide" at this point - truncate it to i1 vector
23743   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23744   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23745
23746   // The mask is killed by scatter, add it to the values
23747   SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23748   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23749   NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23750                                     N->getMemOperand());
23751   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23752   return SDValue(NewScatter.getNode(), 1);
23753 }
23754
23755 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23756                           SelectionDAG &DAG) {
23757
23758   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23759   MVT VT = Op.getSimpleValueType();
23760   MVT ScalarVT = VT.getScalarType();
23761   SDValue Mask = N->getMask();
23762   SDLoc dl(Op);
23763
23764   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23765          "Expanding masked load is supported on AVX-512 target only!");
23766
23767   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23768          "Expanding masked load is supported for 32 and 64-bit types only!");
23769
23770   // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23771   // VLX. These types for exp-loads are handled here.
23772   if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23773     return Op;
23774
23775   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23776          "Cannot lower masked load op.");
23777
23778   assert((ScalarVT.getSizeInBits() >= 32 ||
23779           (Subtarget.hasBWI() &&
23780               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23781          "Unsupported masked load op.");
23782
23783   // This operation is legal for targets with VLX, but without
23784   // VLX the vector should be widened to 512 bit
23785   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23786   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23787   SDValue Src0 = N->getSrc0();
23788   Src0 = ExtendToType(Src0, WideDataVT, DAG);
23789
23790   // Mask element has to be i1.
23791   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23792   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23793          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23794
23795   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23796
23797   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23798   if (MaskEltTy != MVT::i1)
23799     Mask = DAG.getNode(ISD::TRUNCATE, dl,
23800                        MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23801   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23802                                       N->getBasePtr(), Mask, Src0,
23803                                       N->getMemoryVT(), N->getMemOperand(),
23804                                       N->getExtensionType(),
23805                                       N->isExpandingLoad());
23806
23807   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23808                                NewLoad.getValue(0),
23809                                DAG.getIntPtrConstant(0, dl));
23810   SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
23811   return DAG.getMergeValues(RetOps, dl);
23812 }
23813
23814 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23815                            SelectionDAG &DAG) {
23816   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23817   SDValue DataToStore = N->getValue();
23818   MVT VT = DataToStore.getSimpleValueType();
23819   MVT ScalarVT = VT.getScalarType();
23820   SDValue Mask = N->getMask();
23821   SDLoc dl(Op);
23822
23823   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23824          "Expanding masked load is supported on AVX-512 target only!");
23825
23826   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23827          "Expanding masked load is supported for 32 and 64-bit types only!");
23828
23829   // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
23830   if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23831     return Op;
23832
23833   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23834          "Cannot lower masked store op.");
23835
23836   assert((ScalarVT.getSizeInBits() >= 32 ||
23837           (Subtarget.hasBWI() &&
23838               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23839           "Unsupported masked store op.");
23840
23841   // This operation is legal for targets with VLX, but without
23842   // VLX the vector should be widened to 512 bit
23843   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23844   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23845
23846   // Mask element has to be i1.
23847   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23848   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23849          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23850
23851   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23852
23853   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23854   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23855   if (MaskEltTy != MVT::i1)
23856     Mask = DAG.getNode(ISD::TRUNCATE, dl,
23857                        MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23858   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23859                             Mask, N->getMemoryVT(), N->getMemOperand(),
23860                             N->isTruncatingStore(), N->isCompressingStore());
23861 }
23862
23863 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23864                             SelectionDAG &DAG) {
23865   assert(Subtarget.hasAVX512() &&
23866          "MGATHER/MSCATTER are supported on AVX-512 arch only");
23867
23868   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23869   SDLoc dl(Op);
23870   MVT VT = Op.getSimpleValueType();
23871   SDValue Index = N->getIndex();
23872   SDValue Mask = N->getMask();
23873   SDValue Src0 = N->getValue();
23874   MVT IndexVT = Index.getSimpleValueType();
23875   MVT MaskVT = Mask.getSimpleValueType();
23876
23877   unsigned NumElts = VT.getVectorNumElements();
23878   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23879
23880   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23881       !Index.getSimpleValueType().is512BitVector()) {
23882     // AVX512F supports only 512-bit vectors. Or data or index should
23883     // be 512 bit wide. If now the both index and data are 256-bit, but
23884     // the vector contains 8 elements, we just sign-extend the index
23885     if (NumElts == 8) {
23886       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23887       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
23888                         N->getOperand(3), Index };
23889       DAG.UpdateNodeOperands(N, Ops);
23890       return Op;
23891     }
23892
23893     // Minimal number of elements in Gather
23894     NumElts = 8;
23895     // Index
23896     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23897     Index = ExtendToType(Index, NewIndexVT, DAG);
23898     if (IndexVT.getScalarType() == MVT::i32)
23899       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23900
23901     // Mask
23902     MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23903     // At this point we have promoted mask operand
23904     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23905     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23906     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23907     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23908
23909     // The pass-through value
23910     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23911     Src0 = ExtendToType(Src0, NewVT, DAG);
23912
23913     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23914     SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23915                                             N->getMemoryVT(), dl, Ops,
23916                                             N->getMemOperand());
23917     SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23918                                  NewGather.getValue(0),
23919                                  DAG.getIntPtrConstant(0, dl));
23920     SDValue RetOps[] = {Exract, NewGather.getValue(1)};
23921     return DAG.getMergeValues(RetOps, dl);
23922   }
23923   if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
23924     // There is a special case when the return type is v2i32 is illegal and
23925     // the type legaizer extended it to v2i64. Without this conversion we end up
23926     // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
23927     // In order to avoid this situation, we'll build an X86 specific Gather node
23928     // with index v2i64 and value type v4i32.
23929     assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
23930            "Unexpected type in masked gather");
23931     Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
23932                                 DAG.getBitcast(MVT::v4i32, Src0),
23933                                 DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
23934     // The mask should match the destination type. Extending mask with zeroes
23935     // is not necessary since instruction itself reads only two values from
23936     // memory.
23937     Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
23938     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23939     SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23940       DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
23941       N->getMemOperand());
23942
23943     SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
23944                                   NewGather.getValue(0), DAG);
23945     SDValue RetOps[] = { Sext, NewGather.getValue(1) };
23946     return DAG.getMergeValues(RetOps, dl);
23947   }
23948   if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
23949     // This transformation is for optimization only.
23950     // The type legalizer extended mask and index to 4 elements vector
23951     // in order to match requirements of the common gather node - same
23952     // vector width of index and value. X86 Gather node allows mismatch
23953     // of vector width in order to select more optimal instruction at the
23954     // end.
23955     assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
23956            "Unexpected type in masked gather");
23957     if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
23958         ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
23959         Index.getOpcode() == ISD::CONCAT_VECTORS &&
23960         Index.getOperand(1).isUndef()) {
23961       Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
23962       Index = Index.getOperand(0);
23963     } else
23964       return Op;
23965     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23966     SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23967       DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
23968       N->getMemOperand());
23969
23970     SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
23971     return DAG.getMergeValues(RetOps, dl);
23972
23973   }
23974   return Op;
23975 }
23976
23977 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23978                                                     SelectionDAG &DAG) const {
23979   // TODO: Eventually, the lowering of these nodes should be informed by or
23980   // deferred to the GC strategy for the function in which they appear. For
23981   // now, however, they must be lowered to something. Since they are logically
23982   // no-ops in the case of a null GC strategy (or a GC strategy which does not
23983   // require special handling for these nodes), lower them as literal NOOPs for
23984   // the time being.
23985   SmallVector<SDValue, 2> Ops;
23986
23987   Ops.push_back(Op.getOperand(0));
23988   if (Op->getGluedNode())
23989     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23990
23991   SDLoc OpDL(Op);
23992   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23993   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23994
23995   return NOOP;
23996 }
23997
23998 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23999                                                   SelectionDAG &DAG) const {
24000   // TODO: Eventually, the lowering of these nodes should be informed by or
24001   // deferred to the GC strategy for the function in which they appear. For
24002   // now, however, they must be lowered to something. Since they are logically
24003   // no-ops in the case of a null GC strategy (or a GC strategy which does not
24004   // require special handling for these nodes), lower them as literal NOOPs for
24005   // the time being.
24006   SmallVector<SDValue, 2> Ops;
24007
24008   Ops.push_back(Op.getOperand(0));
24009   if (Op->getGluedNode())
24010     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24011
24012   SDLoc OpDL(Op);
24013   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24014   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24015
24016   return NOOP;
24017 }
24018
24019 /// Provide custom lowering hooks for some operations.
24020 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24021   switch (Op.getOpcode()) {
24022   default: llvm_unreachable("Should not custom lower this!");
24023   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24024   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24025     return LowerCMP_SWAP(Op, Subtarget, DAG);
24026   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
24027   case ISD::ATOMIC_LOAD_ADD:
24028   case ISD::ATOMIC_LOAD_SUB:
24029   case ISD::ATOMIC_LOAD_OR:
24030   case ISD::ATOMIC_LOAD_XOR:
24031   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
24032   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
24033   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
24034   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
24035   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24036   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
24037   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
24038   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24039   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
24040   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24041   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24042   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24043   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
24044   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
24045   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
24046   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
24047   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
24048   case ISD::SHL_PARTS:
24049   case ISD::SRA_PARTS:
24050   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
24051   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
24052   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
24053   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
24054   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
24055   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24056   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
24057   case ISD::ZERO_EXTEND_VECTOR_INREG:
24058   case ISD::SIGN_EXTEND_VECTOR_INREG:
24059     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24060   case ISD::FP_TO_SINT:
24061   case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
24062   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
24063   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
24064   case ISD::FABS:
24065   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
24066   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
24067   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
24068   case ISD::SETCC:              return LowerSETCC(Op, DAG);
24069   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
24070   case ISD::SELECT:             return LowerSELECT(Op, DAG);
24071   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
24072   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
24073   case ISD::VASTART:            return LowerVASTART(Op, DAG);
24074   case ISD::VAARG:              return LowerVAARG(Op, DAG);
24075   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
24076   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
24077   case ISD::INTRINSIC_VOID:
24078   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24079   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
24080   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
24081   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
24082   case ISD::FRAME_TO_ARGS_OFFSET:
24083                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24084   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24085   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
24086   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
24087   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
24088   case ISD::EH_SJLJ_SETUP_DISPATCH:
24089     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24090   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
24091   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
24092   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
24093   case ISD::CTLZ:
24094   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
24095   case ISD::CTTZ:
24096   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
24097   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
24098   case ISD::MULHS:
24099   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
24100   case ISD::UMUL_LOHI:
24101   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
24102   case ISD::ROTL:
24103   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
24104   case ISD::SRA:
24105   case ISD::SRL:
24106   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
24107   case ISD::SADDO:
24108   case ISD::UADDO:
24109   case ISD::SSUBO:
24110   case ISD::USUBO:
24111   case ISD::SMULO:
24112   case ISD::UMULO:              return LowerXALUO(Op, DAG);
24113   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24114   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
24115   case ISD::ADDCARRY:
24116   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
24117   case ISD::ADD:
24118   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
24119   case ISD::SMAX:
24120   case ISD::SMIN:
24121   case ISD::UMAX:
24122   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
24123   case ISD::ABS:                return LowerABS(Op, DAG);
24124   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
24125   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
24126   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
24127   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
24128   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
24129   case ISD::GC_TRANSITION_START:
24130                                 return LowerGC_TRANSITION_START(Op, DAG);
24131   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
24132   case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
24133   }
24134 }
24135
24136 /// Places new result values for the node in Results (their number
24137 /// and types must exactly match those of the original return values of
24138 /// the node), or leaves Results empty, which indicates that the node is not
24139 /// to be custom lowered after all.
24140 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24141                                               SmallVectorImpl<SDValue> &Results,
24142                                               SelectionDAG &DAG) const {
24143   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24144
24145   if (!Res.getNode())
24146     return;
24147
24148   assert((N->getNumValues() <= Res->getNumValues()) &&
24149       "Lowering returned the wrong number of results!");
24150
24151   // Places new result values base on N result number.
24152   // In some cases (LowerSINT_TO_FP for example) Res has more result values
24153   // than original node, chain should be dropped(last value).
24154   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24155     Results.push_back(Res.getValue(I));
24156 }
24157
24158 /// Replace a node with an illegal result type with a new node built out of
24159 /// custom code.
24160 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24161                                            SmallVectorImpl<SDValue>&Results,
24162                                            SelectionDAG &DAG) const {
24163   SDLoc dl(N);
24164   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24165   switch (N->getOpcode()) {
24166   default:
24167     llvm_unreachable("Do not know how to custom type legalize this operation!");
24168   case X86ISD::AVG: {
24169     // Legalize types for X86ISD::AVG by expanding vectors.
24170     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24171
24172     auto InVT = N->getValueType(0);
24173     auto InVTSize = InVT.getSizeInBits();
24174     const unsigned RegSize =
24175         (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24176     assert((Subtarget.hasBWI() || RegSize < 512) &&
24177            "512-bit vector requires AVX512BW");
24178     assert((Subtarget.hasAVX2() || RegSize < 256) &&
24179            "256-bit vector requires AVX2");
24180
24181     auto ElemVT = InVT.getVectorElementType();
24182     auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24183                                   RegSize / ElemVT.getSizeInBits());
24184     assert(RegSize % InVT.getSizeInBits() == 0);
24185     unsigned NumConcat = RegSize / InVT.getSizeInBits();
24186
24187     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24188     Ops[0] = N->getOperand(0);
24189     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24190     Ops[0] = N->getOperand(1);
24191     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24192
24193     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24194     Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24195                                   DAG.getIntPtrConstant(0, dl)));
24196     return;
24197   }
24198   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24199   case X86ISD::FMINC:
24200   case X86ISD::FMIN:
24201   case X86ISD::FMAXC:
24202   case X86ISD::FMAX: {
24203     EVT VT = N->getValueType(0);
24204     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24205     SDValue UNDEF = DAG.getUNDEF(VT);
24206     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24207                               N->getOperand(0), UNDEF);
24208     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24209                               N->getOperand(1), UNDEF);
24210     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24211     return;
24212   }
24213   case ISD::SDIV:
24214   case ISD::UDIV:
24215   case ISD::SREM:
24216   case ISD::UREM:
24217   case ISD::SDIVREM:
24218   case ISD::UDIVREM: {
24219     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24220     Results.push_back(V);
24221     return;
24222   }
24223   case ISD::FP_TO_SINT:
24224   case ISD::FP_TO_UINT: {
24225     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24226
24227     if (N->getValueType(0) == MVT::v2i32) {
24228       assert((IsSigned || Subtarget.hasAVX512()) &&
24229              "Can only handle signed conversion without AVX512");
24230       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24231       SDValue Src = N->getOperand(0);
24232       if (Src.getValueType() == MVT::v2f64) {
24233         SDValue Idx = DAG.getIntPtrConstant(0, dl);
24234         SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24235                                            : X86ISD::CVTTP2UI,
24236                                   dl, MVT::v4i32, Src);
24237         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24238         Results.push_back(Res);
24239         return;
24240       }
24241       if (Src.getValueType() == MVT::v2f32) {
24242         SDValue Idx = DAG.getIntPtrConstant(0, dl);
24243         SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24244                                   DAG.getUNDEF(MVT::v2f32));
24245         Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24246                                    : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24247         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24248         Results.push_back(Res);
24249         return;
24250       }
24251
24252       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24253       // so early out here.
24254       return;
24255     }
24256
24257     std::pair<SDValue,SDValue> Vals =
24258         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24259     SDValue FIST = Vals.first, StackSlot = Vals.second;
24260     if (FIST.getNode()) {
24261       EVT VT = N->getValueType(0);
24262       // Return a load from the stack slot.
24263       if (StackSlot.getNode())
24264         Results.push_back(
24265             DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24266       else
24267         Results.push_back(FIST);
24268     }
24269     return;
24270   }
24271   case ISD::SINT_TO_FP: {
24272     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24273     SDValue Src = N->getOperand(0);
24274     if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24275       return;
24276     Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24277     return;
24278   }
24279   case ISD::UINT_TO_FP: {
24280     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24281     EVT VT = N->getValueType(0);
24282     if (VT != MVT::v2f32)
24283       return;
24284     SDValue Src = N->getOperand(0);
24285     EVT SrcVT = Src.getValueType();
24286     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24287       Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24288       return;
24289     }
24290     if (SrcVT != MVT::v2i32)
24291       return;
24292     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24293     SDValue VBias =
24294         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24295     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24296                              DAG.getBitcast(MVT::v2i64, VBias));
24297     Or = DAG.getBitcast(MVT::v2f64, Or);
24298     // TODO: Are there any fast-math-flags to propagate here?
24299     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24300     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24301     return;
24302   }
24303   case ISD::FP_ROUND: {
24304     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24305         return;
24306     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24307     Results.push_back(V);
24308     return;
24309   }
24310   case ISD::FP_EXTEND: {
24311     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24312     // No other ValueType for FP_EXTEND should reach this point.
24313     assert(N->getValueType(0) == MVT::v2f32 &&
24314            "Do not know how to legalize this Node");
24315     return;
24316   }
24317   case ISD::INTRINSIC_W_CHAIN: {
24318     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24319     switch (IntNo) {
24320     default : llvm_unreachable("Do not know how to custom type "
24321                                "legalize this intrinsic operation!");
24322     case Intrinsic::x86_rdtsc:
24323       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24324                                      Results);
24325     case Intrinsic::x86_rdtscp:
24326       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24327                                      Results);
24328     case Intrinsic::x86_rdpmc:
24329       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24330
24331     case Intrinsic::x86_xgetbv:
24332       return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24333     }
24334   }
24335   case ISD::INTRINSIC_WO_CHAIN: {
24336     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24337       Results.push_back(V);
24338     return;
24339   }
24340   case ISD::READCYCLECOUNTER: {
24341     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24342                                    Results);
24343   }
24344   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24345     EVT T = N->getValueType(0);
24346     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24347     bool Regs64bit = T == MVT::i128;
24348     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24349     SDValue cpInL, cpInH;
24350     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24351                         DAG.getConstant(0, dl, HalfT));
24352     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24353                         DAG.getConstant(1, dl, HalfT));
24354     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24355                              Regs64bit ? X86::RAX : X86::EAX,
24356                              cpInL, SDValue());
24357     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24358                              Regs64bit ? X86::RDX : X86::EDX,
24359                              cpInH, cpInL.getValue(1));
24360     SDValue swapInL, swapInH;
24361     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24362                           DAG.getConstant(0, dl, HalfT));
24363     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24364                           DAG.getConstant(1, dl, HalfT));
24365     swapInH =
24366         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24367                          swapInH, cpInH.getValue(1));
24368     // If the current function needs the base pointer, RBX,
24369     // we shouldn't use cmpxchg directly.
24370     // Indeed the lowering of that instruction will clobber
24371     // that register and since RBX will be a reserved register
24372     // the register allocator will not make sure its value will
24373     // be properly saved and restored around this live-range.
24374     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24375     SDValue Result;
24376     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24377     unsigned BasePtr = TRI->getBaseRegister();
24378     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24379     if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24380         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24381       // ISel prefers the LCMPXCHG64 variant.
24382       // If that assert breaks, that means it is not the case anymore,
24383       // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24384       // not just EBX. This is a matter of accepting i64 input for that
24385       // pseudo, and restoring into the register of the right wide
24386       // in expand pseudo. Everything else should just work.
24387       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24388              "Saving only half of the RBX");
24389       unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24390                                   : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24391       SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24392                                            Regs64bit ? X86::RBX : X86::EBX,
24393                                            HalfT, swapInH.getValue(1));
24394       SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24395                        RBXSave,
24396                        /*Glue*/ RBXSave.getValue(2)};
24397       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24398     } else {
24399       unsigned Opcode =
24400           Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24401       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24402                                  Regs64bit ? X86::RBX : X86::EBX, swapInL,
24403                                  swapInH.getValue(1));
24404       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24405                        swapInL.getValue(1)};
24406       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24407     }
24408     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24409                                         Regs64bit ? X86::RAX : X86::EAX,
24410                                         HalfT, Result.getValue(1));
24411     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24412                                         Regs64bit ? X86::RDX : X86::EDX,
24413                                         HalfT, cpOutL.getValue(2));
24414     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24415
24416     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24417                                         MVT::i32, cpOutH.getValue(2));
24418     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24419     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24420
24421     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24422     Results.push_back(Success);
24423     Results.push_back(EFLAGS.getValue(1));
24424     return;
24425   }
24426   case ISD::ATOMIC_SWAP:
24427   case ISD::ATOMIC_LOAD_ADD:
24428   case ISD::ATOMIC_LOAD_SUB:
24429   case ISD::ATOMIC_LOAD_AND:
24430   case ISD::ATOMIC_LOAD_OR:
24431   case ISD::ATOMIC_LOAD_XOR:
24432   case ISD::ATOMIC_LOAD_NAND:
24433   case ISD::ATOMIC_LOAD_MIN:
24434   case ISD::ATOMIC_LOAD_MAX:
24435   case ISD::ATOMIC_LOAD_UMIN:
24436   case ISD::ATOMIC_LOAD_UMAX:
24437   case ISD::ATOMIC_LOAD: {
24438     // Delegate to generic TypeLegalization. Situations we can really handle
24439     // should have already been dealt with by AtomicExpandPass.cpp.
24440     break;
24441   }
24442   case ISD::BITCAST: {
24443     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24444     EVT DstVT = N->getValueType(0);
24445     EVT SrcVT = N->getOperand(0)->getValueType(0);
24446
24447     if (SrcVT != MVT::f64 ||
24448         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24449       return;
24450
24451     unsigned NumElts = DstVT.getVectorNumElements();
24452     EVT SVT = DstVT.getVectorElementType();
24453     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24454     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24455                                    MVT::v2f64, N->getOperand(0));
24456     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24457
24458     if (ExperimentalVectorWideningLegalization) {
24459       // If we are legalizing vectors by widening, we already have the desired
24460       // legal vector type, just return it.
24461       Results.push_back(ToVecInt);
24462       return;
24463     }
24464
24465     SmallVector<SDValue, 8> Elts;
24466     for (unsigned i = 0, e = NumElts; i != e; ++i)
24467       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24468                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
24469
24470     Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24471   }
24472   }
24473 }
24474
24475 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24476   switch ((X86ISD::NodeType)Opcode) {
24477   case X86ISD::FIRST_NUMBER:       break;
24478   case X86ISD::BSF:                return "X86ISD::BSF";
24479   case X86ISD::BSR:                return "X86ISD::BSR";
24480   case X86ISD::SHLD:               return "X86ISD::SHLD";
24481   case X86ISD::SHRD:               return "X86ISD::SHRD";
24482   case X86ISD::FAND:               return "X86ISD::FAND";
24483   case X86ISD::FANDN:              return "X86ISD::FANDN";
24484   case X86ISD::FOR:                return "X86ISD::FOR";
24485   case X86ISD::FXOR:               return "X86ISD::FXOR";
24486   case X86ISD::FILD:               return "X86ISD::FILD";
24487   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
24488   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24489   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24490   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24491   case X86ISD::FLD:                return "X86ISD::FLD";
24492   case X86ISD::FST:                return "X86ISD::FST";
24493   case X86ISD::CALL:               return "X86ISD::CALL";
24494   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
24495   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
24496   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
24497   case X86ISD::BT:                 return "X86ISD::BT";
24498   case X86ISD::CMP:                return "X86ISD::CMP";
24499   case X86ISD::COMI:               return "X86ISD::COMI";
24500   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
24501   case X86ISD::CMPM:               return "X86ISD::CMPM";
24502   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
24503   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
24504   case X86ISD::SETCC:              return "X86ISD::SETCC";
24505   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
24506   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
24507   case X86ISD::FSETCCM:            return "X86ISD::FSETCCM";
24508   case X86ISD::FSETCCM_RND:        return "X86ISD::FSETCCM_RND";
24509   case X86ISD::CMOV:               return "X86ISD::CMOV";
24510   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
24511   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
24512   case X86ISD::IRET:               return "X86ISD::IRET";
24513   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
24514   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
24515   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
24516   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
24517   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
24518   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
24519   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
24520   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
24521   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
24522   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
24523   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
24524   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
24525   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
24526   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
24527   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
24528   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
24529   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
24530   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
24531   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
24532   case X86ISD::HADD:               return "X86ISD::HADD";
24533   case X86ISD::HSUB:               return "X86ISD::HSUB";
24534   case X86ISD::FHADD:              return "X86ISD::FHADD";
24535   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
24536   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
24537   case X86ISD::FMAX:               return "X86ISD::FMAX";
24538   case X86ISD::FMAXS:              return "X86ISD::FMAXS";
24539   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
24540   case X86ISD::FMAXS_RND:          return "X86ISD::FMAX_RND";
24541   case X86ISD::FMIN:               return "X86ISD::FMIN";
24542   case X86ISD::FMINS:              return "X86ISD::FMINS";
24543   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
24544   case X86ISD::FMINS_RND:          return "X86ISD::FMINS_RND";
24545   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
24546   case X86ISD::FMINC:              return "X86ISD::FMINC";
24547   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
24548   case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
24549   case X86ISD::FRCP:               return "X86ISD::FRCP";
24550   case X86ISD::FRCPS:              return "X86ISD::FRCPS";
24551   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
24552   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
24553   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
24554   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
24555   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
24556   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
24557   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
24558   case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24559     return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24560   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
24561   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
24562   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
24563   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
24564   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
24565   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
24566   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
24567   case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24568     return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24569   case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24570     return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24571   case X86ISD::LADD:               return "X86ISD::LADD";
24572   case X86ISD::LSUB:               return "X86ISD::LSUB";
24573   case X86ISD::LOR:                return "X86ISD::LOR";
24574   case X86ISD::LXOR:               return "X86ISD::LXOR";
24575   case X86ISD::LAND:               return "X86ISD::LAND";
24576   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
24577   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
24578   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
24579   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
24580   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
24581   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
24582   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
24583   case X86ISD::VTRUNCSTORES:       return "X86ISD::VTRUNCSTORES";
24584   case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
24585   case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
24586   case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
24587   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
24588   case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
24589   case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
24590   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
24591   case X86ISD::VFPROUND_RND:       return "X86ISD::VFPROUND_RND";
24592   case X86ISD::VFPROUNDS_RND:      return "X86ISD::VFPROUNDS_RND";
24593   case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
24594   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
24595   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
24596   case X86ISD::VSHL:               return "X86ISD::VSHL";
24597   case X86ISD::VSRL:               return "X86ISD::VSRL";
24598   case X86ISD::VSRA:               return "X86ISD::VSRA";
24599   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
24600   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
24601   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
24602   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
24603   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
24604   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
24605   case X86ISD::VPPERM:             return "X86ISD::VPPERM";
24606   case X86ISD::CMPP:               return "X86ISD::CMPP";
24607   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
24608   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
24609   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
24610   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
24611   case X86ISD::ADD:                return "X86ISD::ADD";
24612   case X86ISD::SUB:                return "X86ISD::SUB";
24613   case X86ISD::ADC:                return "X86ISD::ADC";
24614   case X86ISD::SBB:                return "X86ISD::SBB";
24615   case X86ISD::SMUL:               return "X86ISD::SMUL";
24616   case X86ISD::UMUL:               return "X86ISD::UMUL";
24617   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
24618   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
24619   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24620   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24621   case X86ISD::INC:                return "X86ISD::INC";
24622   case X86ISD::DEC:                return "X86ISD::DEC";
24623   case X86ISD::OR:                 return "X86ISD::OR";
24624   case X86ISD::XOR:                return "X86ISD::XOR";
24625   case X86ISD::AND:                return "X86ISD::AND";
24626   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
24627   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
24628   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
24629   case X86ISD::PTEST:              return "X86ISD::PTEST";
24630   case X86ISD::TESTP:              return "X86ISD::TESTP";
24631   case X86ISD::TESTM:              return "X86ISD::TESTM";
24632   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
24633   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
24634   case X86ISD::KTEST:              return "X86ISD::KTEST";
24635   case X86ISD::KSHIFTL:            return "X86ISD::KSHIFTL";
24636   case X86ISD::KSHIFTR:            return "X86ISD::KSHIFTR";
24637   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
24638   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
24639   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
24640   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
24641   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
24642   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
24643   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
24644   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
24645   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
24646   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
24647   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
24648   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
24649   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
24650   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
24651   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
24652   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
24653   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
24654   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
24655   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
24656   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
24657   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
24658   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
24659   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
24660   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
24661   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
24662   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
24663   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
24664   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
24665   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
24666   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
24667   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
24668   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
24669   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
24670   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
24671   case X86ISD::VFIXUPIMMS:          return "X86ISD::VFIXUPIMMS";
24672   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
24673   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
24674   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
24675   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
24676   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
24677   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24678   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
24679   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
24680   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
24681   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
24682   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
24683   case X86ISD::SAHF:               return "X86ISD::SAHF";
24684   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
24685   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
24686   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
24687   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
24688   case X86ISD::VPROT:              return "X86ISD::VPROT";
24689   case X86ISD::VPROTI:             return "X86ISD::VPROTI";
24690   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
24691   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
24692   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
24693   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
24694   case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
24695   case X86ISD::FMADD:              return "X86ISD::FMADD";
24696   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
24697   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
24698   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
24699   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
24700   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
24701   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
24702   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
24703   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
24704   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
24705   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
24706   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
24707   case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
24708   case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
24709   case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
24710   case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
24711   case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
24712   case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
24713   case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
24714   case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
24715   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
24716   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
24717   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
24718   case X86ISD::VRNDSCALES:         return "X86ISD::VRNDSCALES";
24719   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
24720   case X86ISD::VREDUCES:           return "X86ISD::VREDUCES";
24721   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
24722   case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
24723   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
24724   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
24725   case X86ISD::XTEST:              return "X86ISD::XTEST";
24726   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
24727   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
24728   case X86ISD::SELECT:             return "X86ISD::SELECT";
24729   case X86ISD::SELECTS:            return "X86ISD::SELECTS";
24730   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
24731   case X86ISD::RCP28:              return "X86ISD::RCP28";
24732   case X86ISD::RCP28S:             return "X86ISD::RCP28S";
24733   case X86ISD::EXP2:               return "X86ISD::EXP2";
24734   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
24735   case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
24736   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
24737   case X86ISD::FADDS_RND:          return "X86ISD::FADDS_RND";
24738   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
24739   case X86ISD::FSUBS_RND:          return "X86ISD::FSUBS_RND";
24740   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
24741   case X86ISD::FMULS_RND:          return "X86ISD::FMULS_RND";
24742   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
24743   case X86ISD::FDIVS_RND:          return "X86ISD::FDIVS_RND";
24744   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
24745   case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
24746   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
24747   case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
24748   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
24749   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
24750   case X86ISD::ADDS:               return "X86ISD::ADDS";
24751   case X86ISD::SUBS:               return "X86ISD::SUBS";
24752   case X86ISD::AVG:                return "X86ISD::AVG";
24753   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
24754   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
24755   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
24756   case X86ISD::CVTTP2SI:           return "X86ISD::CVTTP2SI";
24757   case X86ISD::CVTTP2UI:           return "X86ISD::CVTTP2UI";
24758   case X86ISD::CVTTP2SI_RND:       return "X86ISD::CVTTP2SI_RND";
24759   case X86ISD::CVTTP2UI_RND:       return "X86ISD::CVTTP2UI_RND";
24760   case X86ISD::CVTTS2SI_RND:       return "X86ISD::CVTTS2SI_RND";
24761   case X86ISD::CVTTS2UI_RND:       return "X86ISD::CVTTS2UI_RND";
24762   case X86ISD::CVTSI2P:            return "X86ISD::CVTSI2P";
24763   case X86ISD::CVTUI2P:            return "X86ISD::CVTUI2P";
24764   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
24765   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
24766   case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
24767   case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24768   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24769   case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
24770   case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
24771   case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
24772   case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
24773   case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
24774   case X86ISD::CVTP2UI_RND:        return "X86ISD::CVTP2UI_RND";
24775   case X86ISD::CVTS2SI_RND:        return "X86ISD::CVTS2SI_RND";
24776   case X86ISD::CVTS2UI_RND:        return "X86ISD::CVTS2UI_RND";
24777   case X86ISD::LWPINS:             return "X86ISD::LWPINS";
24778   case X86ISD::MGATHER:            return "X86ISD::MGATHER";
24779   }
24780   return nullptr;
24781 }
24782
24783 /// Return true if the addressing mode represented by AM is legal for this
24784 /// target, for a load/store of the specified type.
24785 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24786                                               const AddrMode &AM, Type *Ty,
24787                                               unsigned AS) const {
24788   // X86 supports extremely general addressing modes.
24789   CodeModel::Model M = getTargetMachine().getCodeModel();
24790
24791   // X86 allows a sign-extended 32-bit immediate field as a displacement.
24792   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24793     return false;
24794
24795   if (AM.BaseGV) {
24796     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24797
24798     // If a reference to this global requires an extra load, we can't fold it.
24799     if (isGlobalStubReference(GVFlags))
24800       return false;
24801
24802     // If BaseGV requires a register for the PIC base, we cannot also have a
24803     // BaseReg specified.
24804     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24805       return false;
24806
24807     // If lower 4G is not available, then we must use rip-relative addressing.
24808     if ((M != CodeModel::Small || isPositionIndependent()) &&
24809         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24810       return false;
24811   }
24812
24813   switch (AM.Scale) {
24814   case 0:
24815   case 1:
24816   case 2:
24817   case 4:
24818   case 8:
24819     // These scales always work.
24820     break;
24821   case 3:
24822   case 5:
24823   case 9:
24824     // These scales are formed with basereg+scalereg.  Only accept if there is
24825     // no basereg yet.
24826     if (AM.HasBaseReg)
24827       return false;
24828     break;
24829   default:  // Other stuff never works.
24830     return false;
24831   }
24832
24833   return true;
24834 }
24835
24836 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24837   unsigned Bits = Ty->getScalarSizeInBits();
24838
24839   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24840   // particularly cheaper than those without.
24841   if (Bits == 8)
24842     return false;
24843
24844   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24845   // variable shifts just as cheap as scalar ones.
24846   if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24847     return false;
24848
24849   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24850   // fully general vector.
24851   return true;
24852 }
24853
24854 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24855   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24856     return false;
24857   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24858   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24859   return NumBits1 > NumBits2;
24860 }
24861
24862 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24863   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24864     return false;
24865
24866   if (!isTypeLegal(EVT::getEVT(Ty1)))
24867     return false;
24868
24869   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24870
24871   // Assuming the caller doesn't have a zeroext or signext return parameter,
24872   // truncation all the way down to i1 is valid.
24873   return true;
24874 }
24875
24876 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24877   return isInt<32>(Imm);
24878 }
24879
24880 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24881   // Can also use sub to handle negated immediates.
24882   return isInt<32>(Imm);
24883 }
24884
24885 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24886   if (!VT1.isInteger() || !VT2.isInteger())
24887     return false;
24888   unsigned NumBits1 = VT1.getSizeInBits();
24889   unsigned NumBits2 = VT2.getSizeInBits();
24890   return NumBits1 > NumBits2;
24891 }
24892
24893 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24894   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24895   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24896 }
24897
24898 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24899   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24900   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24901 }
24902
24903 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24904   EVT VT1 = Val.getValueType();
24905   if (isZExtFree(VT1, VT2))
24906     return true;
24907
24908   if (Val.getOpcode() != ISD::LOAD)
24909     return false;
24910
24911   if (!VT1.isSimple() || !VT1.isInteger() ||
24912       !VT2.isSimple() || !VT2.isInteger())
24913     return false;
24914
24915   switch (VT1.getSimpleVT().SimpleTy) {
24916   default: break;
24917   case MVT::i8:
24918   case MVT::i16:
24919   case MVT::i32:
24920     // X86 has 8, 16, and 32-bit zero-extending loads.
24921     return true;
24922   }
24923
24924   return false;
24925 }
24926
24927 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24928
24929 bool
24930 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24931   if (!Subtarget.hasAnyFMA())
24932     return false;
24933
24934   VT = VT.getScalarType();
24935
24936   if (!VT.isSimple())
24937     return false;
24938
24939   switch (VT.getSimpleVT().SimpleTy) {
24940   case MVT::f32:
24941   case MVT::f64:
24942     return true;
24943   default:
24944     break;
24945   }
24946
24947   return false;
24948 }
24949
24950 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24951   // i16 instructions are longer (0x66 prefix) and potentially slower.
24952   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24953 }
24954
24955 /// Targets can use this to indicate that they only support *some*
24956 /// VECTOR_SHUFFLE operations, those with specific masks.
24957 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24958 /// are assumed to be legal.
24959 bool
24960 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24961                                       EVT VT) const {
24962   if (!VT.isSimple())
24963     return false;
24964
24965   // Not for i1 vectors
24966   if (VT.getSimpleVT().getScalarType() == MVT::i1)
24967     return false;
24968
24969   // Very little shuffling can be done for 64-bit vectors right now.
24970   if (VT.getSimpleVT().getSizeInBits() == 64)
24971     return false;
24972
24973   // We only care that the types being shuffled are legal. The lowering can
24974   // handle any possible shuffle mask that results.
24975   return isTypeLegal(VT.getSimpleVT());
24976 }
24977
24978 bool
24979 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24980                                           EVT VT) const {
24981   // Just delegate to the generic legality, clear masks aren't special.
24982   return isShuffleMaskLegal(Mask, VT);
24983 }
24984
24985 //===----------------------------------------------------------------------===//
24986 //                           X86 Scheduler Hooks
24987 //===----------------------------------------------------------------------===//
24988
24989 /// Utility function to emit xbegin specifying the start of an RTM region.
24990 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24991                                      const TargetInstrInfo *TII) {
24992   DebugLoc DL = MI.getDebugLoc();
24993
24994   const BasicBlock *BB = MBB->getBasicBlock();
24995   MachineFunction::iterator I = ++MBB->getIterator();
24996
24997   // For the v = xbegin(), we generate
24998   //
24999   // thisMBB:
25000   //  xbegin sinkMBB
25001   //
25002   // mainMBB:
25003   //  s0 = -1
25004   //
25005   // fallBB:
25006   //  eax = # XABORT_DEF
25007   //  s1 = eax
25008   //
25009   // sinkMBB:
25010   //  v = phi(s0/mainBB, s1/fallBB)
25011
25012   MachineBasicBlock *thisMBB = MBB;
25013   MachineFunction *MF = MBB->getParent();
25014   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25015   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25016   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25017   MF->insert(I, mainMBB);
25018   MF->insert(I, fallMBB);
25019   MF->insert(I, sinkMBB);
25020
25021   // Transfer the remainder of BB and its successor edges to sinkMBB.
25022   sinkMBB->splice(sinkMBB->begin(), MBB,
25023                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25024   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25025
25026   MachineRegisterInfo &MRI = MF->getRegInfo();
25027   unsigned DstReg = MI.getOperand(0).getReg();
25028   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25029   unsigned mainDstReg = MRI.createVirtualRegister(RC);
25030   unsigned fallDstReg = MRI.createVirtualRegister(RC);
25031
25032   // thisMBB:
25033   //  xbegin fallMBB
25034   //  # fallthrough to mainMBB
25035   //  # abortion to fallMBB
25036   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25037   thisMBB->addSuccessor(mainMBB);
25038   thisMBB->addSuccessor(fallMBB);
25039
25040   // mainMBB:
25041   //  mainDstReg := -1
25042   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25043   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25044   mainMBB->addSuccessor(sinkMBB);
25045
25046   // fallMBB:
25047   //  ; pseudo instruction to model hardware's definition from XABORT
25048   //  EAX := XABORT_DEF
25049   //  fallDstReg := EAX
25050   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25051   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25052       .addReg(X86::EAX);
25053   fallMBB->addSuccessor(sinkMBB);
25054
25055   // sinkMBB:
25056   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25057   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25058       .addReg(mainDstReg).addMBB(mainMBB)
25059       .addReg(fallDstReg).addMBB(fallMBB);
25060
25061   MI.eraseFromParent();
25062   return sinkMBB;
25063 }
25064
25065 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25066 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25067 // in the .td file.
25068 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25069                                        const TargetInstrInfo *TII) {
25070   unsigned Opc;
25071   switch (MI.getOpcode()) {
25072   default: llvm_unreachable("illegal opcode!");
25073   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
25074   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25075   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
25076   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25077   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
25078   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25079   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
25080   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25081   }
25082
25083   DebugLoc dl = MI.getDebugLoc();
25084   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25085
25086   unsigned NumArgs = MI.getNumOperands();
25087   for (unsigned i = 1; i < NumArgs; ++i) {
25088     MachineOperand &Op = MI.getOperand(i);
25089     if (!(Op.isReg() && Op.isImplicit()))
25090       MIB.add(Op);
25091   }
25092   if (MI.hasOneMemOperand())
25093     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25094
25095   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25096       .addReg(X86::XMM0);
25097
25098   MI.eraseFromParent();
25099   return BB;
25100 }
25101
25102 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25103 // defs in an instruction pattern
25104 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25105                                        const TargetInstrInfo *TII) {
25106   unsigned Opc;
25107   switch (MI.getOpcode()) {
25108   default: llvm_unreachable("illegal opcode!");
25109   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
25110   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25111   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
25112   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25113   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
25114   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25115   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
25116   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25117   }
25118
25119   DebugLoc dl = MI.getDebugLoc();
25120   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25121
25122   unsigned NumArgs = MI.getNumOperands(); // remove the results
25123   for (unsigned i = 1; i < NumArgs; ++i) {
25124     MachineOperand &Op = MI.getOperand(i);
25125     if (!(Op.isReg() && Op.isImplicit()))
25126       MIB.add(Op);
25127   }
25128   if (MI.hasOneMemOperand())
25129     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25130
25131   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25132       .addReg(X86::ECX);
25133
25134   MI.eraseFromParent();
25135   return BB;
25136 }
25137
25138 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25139                                      const X86Subtarget &Subtarget) {
25140   DebugLoc dl = MI.getDebugLoc();
25141   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25142
25143   // insert input VAL into EAX
25144   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25145       .addReg(MI.getOperand(0).getReg());
25146   // insert zero to ECX
25147   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25148
25149   // insert zero to EDX
25150   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25151
25152   // insert WRPKRU instruction
25153   BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25154
25155   MI.eraseFromParent(); // The pseudo is gone now.
25156   return BB;
25157 }
25158
25159 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25160                                      const X86Subtarget &Subtarget) {
25161   DebugLoc dl = MI.getDebugLoc();
25162   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25163
25164   // insert zero to ECX
25165   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25166
25167   // insert RDPKRU instruction
25168   BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25169   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25170       .addReg(X86::EAX);
25171
25172   MI.eraseFromParent(); // The pseudo is gone now.
25173   return BB;
25174 }
25175
25176 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25177                                       const X86Subtarget &Subtarget,
25178                                       unsigned Opc) {
25179   DebugLoc dl = MI.getDebugLoc();
25180   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25181   // Address into RAX/EAX, other two args into ECX, EDX.
25182   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25183   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25184   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25185   for (int i = 0; i < X86::AddrNumOperands; ++i)
25186     MIB.add(MI.getOperand(i));
25187
25188   unsigned ValOps = X86::AddrNumOperands;
25189   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25190       .addReg(MI.getOperand(ValOps).getReg());
25191   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25192       .addReg(MI.getOperand(ValOps + 1).getReg());
25193
25194   // The instruction doesn't actually take any operands though.
25195   BuildMI(*BB, MI, dl, TII->get(Opc));
25196
25197   MI.eraseFromParent(); // The pseudo is gone now.
25198   return BB;
25199 }
25200
25201 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25202                                       const X86Subtarget &Subtarget) {
25203   DebugLoc dl = MI->getDebugLoc();
25204   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25205   // Address into RAX/EAX
25206   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25207   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25208   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25209   for (int i = 0; i < X86::AddrNumOperands; ++i)
25210     MIB.add(MI->getOperand(i));
25211
25212   // The instruction doesn't actually take any operands though.
25213   BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25214
25215   MI->eraseFromParent(); // The pseudo is gone now.
25216   return BB;
25217 }
25218
25219
25220
25221 MachineBasicBlock *
25222 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25223                                                  MachineBasicBlock *MBB) const {
25224   // Emit va_arg instruction on X86-64.
25225
25226   // Operands to this pseudo-instruction:
25227   // 0  ) Output        : destination address (reg)
25228   // 1-5) Input         : va_list address (addr, i64mem)
25229   // 6  ) ArgSize       : Size (in bytes) of vararg type
25230   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25231   // 8  ) Align         : Alignment of type
25232   // 9  ) EFLAGS (implicit-def)
25233
25234   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25235   static_assert(X86::AddrNumOperands == 5,
25236                 "VAARG_64 assumes 5 address operands");
25237
25238   unsigned DestReg = MI.getOperand(0).getReg();
25239   MachineOperand &Base = MI.getOperand(1);
25240   MachineOperand &Scale = MI.getOperand(2);
25241   MachineOperand &Index = MI.getOperand(3);
25242   MachineOperand &Disp = MI.getOperand(4);
25243   MachineOperand &Segment = MI.getOperand(5);
25244   unsigned ArgSize = MI.getOperand(6).getImm();
25245   unsigned ArgMode = MI.getOperand(7).getImm();
25246   unsigned Align = MI.getOperand(8).getImm();
25247
25248   // Memory Reference
25249   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25250   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25251   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25252
25253   // Machine Information
25254   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25255   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25256   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25257   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25258   DebugLoc DL = MI.getDebugLoc();
25259
25260   // struct va_list {
25261   //   i32   gp_offset
25262   //   i32   fp_offset
25263   //   i64   overflow_area (address)
25264   //   i64   reg_save_area (address)
25265   // }
25266   // sizeof(va_list) = 24
25267   // alignment(va_list) = 8
25268
25269   unsigned TotalNumIntRegs = 6;
25270   unsigned TotalNumXMMRegs = 8;
25271   bool UseGPOffset = (ArgMode == 1);
25272   bool UseFPOffset = (ArgMode == 2);
25273   unsigned MaxOffset = TotalNumIntRegs * 8 +
25274                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
25275
25276   /* Align ArgSize to a multiple of 8 */
25277   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25278   bool NeedsAlign = (Align > 8);
25279
25280   MachineBasicBlock *thisMBB = MBB;
25281   MachineBasicBlock *overflowMBB;
25282   MachineBasicBlock *offsetMBB;
25283   MachineBasicBlock *endMBB;
25284
25285   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
25286   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
25287   unsigned OffsetReg = 0;
25288
25289   if (!UseGPOffset && !UseFPOffset) {
25290     // If we only pull from the overflow region, we don't create a branch.
25291     // We don't need to alter control flow.
25292     OffsetDestReg = 0; // unused
25293     OverflowDestReg = DestReg;
25294
25295     offsetMBB = nullptr;
25296     overflowMBB = thisMBB;
25297     endMBB = thisMBB;
25298   } else {
25299     // First emit code to check if gp_offset (or fp_offset) is below the bound.
25300     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25301     // If not, pull from overflow_area. (branch to overflowMBB)
25302     //
25303     //       thisMBB
25304     //         |     .
25305     //         |        .
25306     //     offsetMBB   overflowMBB
25307     //         |        .
25308     //         |     .
25309     //        endMBB
25310
25311     // Registers for the PHI in endMBB
25312     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25313     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25314
25315     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25316     MachineFunction *MF = MBB->getParent();
25317     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25318     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25319     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25320
25321     MachineFunction::iterator MBBIter = ++MBB->getIterator();
25322
25323     // Insert the new basic blocks
25324     MF->insert(MBBIter, offsetMBB);
25325     MF->insert(MBBIter, overflowMBB);
25326     MF->insert(MBBIter, endMBB);
25327
25328     // Transfer the remainder of MBB and its successor edges to endMBB.
25329     endMBB->splice(endMBB->begin(), thisMBB,
25330                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25331     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25332
25333     // Make offsetMBB and overflowMBB successors of thisMBB
25334     thisMBB->addSuccessor(offsetMBB);
25335     thisMBB->addSuccessor(overflowMBB);
25336
25337     // endMBB is a successor of both offsetMBB and overflowMBB
25338     offsetMBB->addSuccessor(endMBB);
25339     overflowMBB->addSuccessor(endMBB);
25340
25341     // Load the offset value into a register
25342     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25343     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25344         .add(Base)
25345         .add(Scale)
25346         .add(Index)
25347         .addDisp(Disp, UseFPOffset ? 4 : 0)
25348         .add(Segment)
25349         .setMemRefs(MMOBegin, MMOEnd);
25350
25351     // Check if there is enough room left to pull this argument.
25352     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25353       .addReg(OffsetReg)
25354       .addImm(MaxOffset + 8 - ArgSizeA8);
25355
25356     // Branch to "overflowMBB" if offset >= max
25357     // Fall through to "offsetMBB" otherwise
25358     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25359       .addMBB(overflowMBB);
25360   }
25361
25362   // In offsetMBB, emit code to use the reg_save_area.
25363   if (offsetMBB) {
25364     assert(OffsetReg != 0);
25365
25366     // Read the reg_save_area address.
25367     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25368     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25369         .add(Base)
25370         .add(Scale)
25371         .add(Index)
25372         .addDisp(Disp, 16)
25373         .add(Segment)
25374         .setMemRefs(MMOBegin, MMOEnd);
25375
25376     // Zero-extend the offset
25377     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25378       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25379         .addImm(0)
25380         .addReg(OffsetReg)
25381         .addImm(X86::sub_32bit);
25382
25383     // Add the offset to the reg_save_area to get the final address.
25384     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25385       .addReg(OffsetReg64)
25386       .addReg(RegSaveReg);
25387
25388     // Compute the offset for the next argument
25389     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25390     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25391       .addReg(OffsetReg)
25392       .addImm(UseFPOffset ? 16 : 8);
25393
25394     // Store it back into the va_list.
25395     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25396         .add(Base)
25397         .add(Scale)
25398         .add(Index)
25399         .addDisp(Disp, UseFPOffset ? 4 : 0)
25400         .add(Segment)
25401         .addReg(NextOffsetReg)
25402         .setMemRefs(MMOBegin, MMOEnd);
25403
25404     // Jump to endMBB
25405     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25406       .addMBB(endMBB);
25407   }
25408
25409   //
25410   // Emit code to use overflow area
25411   //
25412
25413   // Load the overflow_area address into a register.
25414   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25415   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25416       .add(Base)
25417       .add(Scale)
25418       .add(Index)
25419       .addDisp(Disp, 8)
25420       .add(Segment)
25421       .setMemRefs(MMOBegin, MMOEnd);
25422
25423   // If we need to align it, do so. Otherwise, just copy the address
25424   // to OverflowDestReg.
25425   if (NeedsAlign) {
25426     // Align the overflow address
25427     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25428     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25429
25430     // aligned_addr = (addr + (align-1)) & ~(align-1)
25431     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25432       .addReg(OverflowAddrReg)
25433       .addImm(Align-1);
25434
25435     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25436       .addReg(TmpReg)
25437       .addImm(~(uint64_t)(Align-1));
25438   } else {
25439     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25440       .addReg(OverflowAddrReg);
25441   }
25442
25443   // Compute the next overflow address after this argument.
25444   // (the overflow address should be kept 8-byte aligned)
25445   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25446   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25447     .addReg(OverflowDestReg)
25448     .addImm(ArgSizeA8);
25449
25450   // Store the new overflow address.
25451   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25452       .add(Base)
25453       .add(Scale)
25454       .add(Index)
25455       .addDisp(Disp, 8)
25456       .add(Segment)
25457       .addReg(NextAddrReg)
25458       .setMemRefs(MMOBegin, MMOEnd);
25459
25460   // If we branched, emit the PHI to the front of endMBB.
25461   if (offsetMBB) {
25462     BuildMI(*endMBB, endMBB->begin(), DL,
25463             TII->get(X86::PHI), DestReg)
25464       .addReg(OffsetDestReg).addMBB(offsetMBB)
25465       .addReg(OverflowDestReg).addMBB(overflowMBB);
25466   }
25467
25468   // Erase the pseudo instruction
25469   MI.eraseFromParent();
25470
25471   return endMBB;
25472 }
25473
25474 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25475     MachineInstr &MI, MachineBasicBlock *MBB) const {
25476   // Emit code to save XMM registers to the stack. The ABI says that the
25477   // number of registers to save is given in %al, so it's theoretically
25478   // possible to do an indirect jump trick to avoid saving all of them,
25479   // however this code takes a simpler approach and just executes all
25480   // of the stores if %al is non-zero. It's less code, and it's probably
25481   // easier on the hardware branch predictor, and stores aren't all that
25482   // expensive anyway.
25483
25484   // Create the new basic blocks. One block contains all the XMM stores,
25485   // and one block is the final destination regardless of whether any
25486   // stores were performed.
25487   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25488   MachineFunction *F = MBB->getParent();
25489   MachineFunction::iterator MBBIter = ++MBB->getIterator();
25490   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25491   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25492   F->insert(MBBIter, XMMSaveMBB);
25493   F->insert(MBBIter, EndMBB);
25494
25495   // Transfer the remainder of MBB and its successor edges to EndMBB.
25496   EndMBB->splice(EndMBB->begin(), MBB,
25497                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25498   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25499
25500   // The original block will now fall through to the XMM save block.
25501   MBB->addSuccessor(XMMSaveMBB);
25502   // The XMMSaveMBB will fall through to the end block.
25503   XMMSaveMBB->addSuccessor(EndMBB);
25504
25505   // Now add the instructions.
25506   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25507   DebugLoc DL = MI.getDebugLoc();
25508
25509   unsigned CountReg = MI.getOperand(0).getReg();
25510   int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25511   int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25512
25513   if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25514     // If %al is 0, branch around the XMM save block.
25515     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25516     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25517     MBB->addSuccessor(EndMBB);
25518   }
25519
25520   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25521   // that was just emitted, but clearly shouldn't be "saved".
25522   assert((MI.getNumOperands() <= 3 ||
25523           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25524           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25525          "Expected last argument to be EFLAGS");
25526   unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25527   // In the XMM save block, save all the XMM argument registers.
25528   for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25529     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25530     MachineMemOperand *MMO = F->getMachineMemOperand(
25531         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25532         MachineMemOperand::MOStore,
25533         /*Size=*/16, /*Align=*/16);
25534     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25535         .addFrameIndex(RegSaveFrameIndex)
25536         .addImm(/*Scale=*/1)
25537         .addReg(/*IndexReg=*/0)
25538         .addImm(/*Disp=*/Offset)
25539         .addReg(/*Segment=*/0)
25540         .addReg(MI.getOperand(i).getReg())
25541         .addMemOperand(MMO);
25542   }
25543
25544   MI.eraseFromParent(); // The pseudo instruction is gone now.
25545
25546   return EndMBB;
25547 }
25548
25549 // The EFLAGS operand of SelectItr might be missing a kill marker
25550 // because there were multiple uses of EFLAGS, and ISel didn't know
25551 // which to mark. Figure out whether SelectItr should have had a
25552 // kill marker, and set it if it should. Returns the correct kill
25553 // marker value.
25554 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25555                                      MachineBasicBlock* BB,
25556                                      const TargetRegisterInfo* TRI) {
25557   // Scan forward through BB for a use/def of EFLAGS.
25558   MachineBasicBlock::iterator miI(std::next(SelectItr));
25559   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25560     const MachineInstr& mi = *miI;
25561     if (mi.readsRegister(X86::EFLAGS))
25562       return false;
25563     if (mi.definesRegister(X86::EFLAGS))
25564       break; // Should have kill-flag - update below.
25565   }
25566
25567   // If we hit the end of the block, check whether EFLAGS is live into a
25568   // successor.
25569   if (miI == BB->end()) {
25570     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25571                                           sEnd = BB->succ_end();
25572          sItr != sEnd; ++sItr) {
25573       MachineBasicBlock* succ = *sItr;
25574       if (succ->isLiveIn(X86::EFLAGS))
25575         return false;
25576     }
25577   }
25578
25579   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25580   // out. SelectMI should have a kill flag on EFLAGS.
25581   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25582   return true;
25583 }
25584
25585 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25586 // together with other CMOV pseudo-opcodes into a single basic-block with
25587 // conditional jump around it.
25588 static bool isCMOVPseudo(MachineInstr &MI) {
25589   switch (MI.getOpcode()) {
25590   case X86::CMOV_FR32:
25591   case X86::CMOV_FR64:
25592   case X86::CMOV_GR8:
25593   case X86::CMOV_GR16:
25594   case X86::CMOV_GR32:
25595   case X86::CMOV_RFP32:
25596   case X86::CMOV_RFP64:
25597   case X86::CMOV_RFP80:
25598   case X86::CMOV_V2F64:
25599   case X86::CMOV_V2I64:
25600   case X86::CMOV_V4F32:
25601   case X86::CMOV_V4F64:
25602   case X86::CMOV_V4I64:
25603   case X86::CMOV_V16F32:
25604   case X86::CMOV_V8F32:
25605   case X86::CMOV_V8F64:
25606   case X86::CMOV_V8I64:
25607   case X86::CMOV_V8I1:
25608   case X86::CMOV_V16I1:
25609   case X86::CMOV_V32I1:
25610   case X86::CMOV_V64I1:
25611     return true;
25612
25613   default:
25614     return false;
25615   }
25616 }
25617
25618 MachineBasicBlock *
25619 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25620                                      MachineBasicBlock *BB) const {
25621   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25622   DebugLoc DL = MI.getDebugLoc();
25623
25624   // To "insert" a SELECT_CC instruction, we actually have to insert the
25625   // diamond control-flow pattern.  The incoming instruction knows the
25626   // destination vreg to set, the condition code register to branch on, the
25627   // true/false values to select between, and a branch opcode to use.
25628   const BasicBlock *LLVM_BB = BB->getBasicBlock();
25629   MachineFunction::iterator It = ++BB->getIterator();
25630
25631   //  thisMBB:
25632   //  ...
25633   //   TrueVal = ...
25634   //   cmpTY ccX, r1, r2
25635   //   bCC copy1MBB
25636   //   fallthrough --> copy0MBB
25637   MachineBasicBlock *thisMBB = BB;
25638   MachineFunction *F = BB->getParent();
25639
25640   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25641   // as described above, by inserting a BB, and then making a PHI at the join
25642   // point to select the true and false operands of the CMOV in the PHI.
25643   //
25644   // The code also handles two different cases of multiple CMOV opcodes
25645   // in a row.
25646   //
25647   // Case 1:
25648   // In this case, there are multiple CMOVs in a row, all which are based on
25649   // the same condition setting (or the exact opposite condition setting).
25650   // In this case we can lower all the CMOVs using a single inserted BB, and
25651   // then make a number of PHIs at the join point to model the CMOVs. The only
25652   // trickiness here, is that in a case like:
25653   //
25654   // t2 = CMOV cond1 t1, f1
25655   // t3 = CMOV cond1 t2, f2
25656   //
25657   // when rewriting this into PHIs, we have to perform some renaming on the
25658   // temps since you cannot have a PHI operand refer to a PHI result earlier
25659   // in the same block.  The "simple" but wrong lowering would be:
25660   //
25661   // t2 = PHI t1(BB1), f1(BB2)
25662   // t3 = PHI t2(BB1), f2(BB2)
25663   //
25664   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25665   // renaming is to note that on the path through BB1, t2 is really just a
25666   // copy of t1, and do that renaming, properly generating:
25667   //
25668   // t2 = PHI t1(BB1), f1(BB2)
25669   // t3 = PHI t1(BB1), f2(BB2)
25670   //
25671   // Case 2, we lower cascaded CMOVs such as
25672   //
25673   //   (CMOV (CMOV F, T, cc1), T, cc2)
25674   //
25675   // to two successive branches.  For that, we look for another CMOV as the
25676   // following instruction.
25677   //
25678   // Without this, we would add a PHI between the two jumps, which ends up
25679   // creating a few copies all around. For instance, for
25680   //
25681   //    (sitofp (zext (fcmp une)))
25682   //
25683   // we would generate:
25684   //
25685   //         ucomiss %xmm1, %xmm0
25686   //         movss  <1.0f>, %xmm0
25687   //         movaps  %xmm0, %xmm1
25688   //         jne     .LBB5_2
25689   //         xorps   %xmm1, %xmm1
25690   // .LBB5_2:
25691   //         jp      .LBB5_4
25692   //         movaps  %xmm1, %xmm0
25693   // .LBB5_4:
25694   //         retq
25695   //
25696   // because this custom-inserter would have generated:
25697   //
25698   //   A
25699   //   | \
25700   //   |  B
25701   //   | /
25702   //   C
25703   //   | \
25704   //   |  D
25705   //   | /
25706   //   E
25707   //
25708   // A: X = ...; Y = ...
25709   // B: empty
25710   // C: Z = PHI [X, A], [Y, B]
25711   // D: empty
25712   // E: PHI [X, C], [Z, D]
25713   //
25714   // If we lower both CMOVs in a single step, we can instead generate:
25715   //
25716   //   A
25717   //   | \
25718   //   |  C
25719   //   | /|
25720   //   |/ |
25721   //   |  |
25722   //   |  D
25723   //   | /
25724   //   E
25725   //
25726   // A: X = ...; Y = ...
25727   // D: empty
25728   // E: PHI [X, A], [X, C], [Y, D]
25729   //
25730   // Which, in our sitofp/fcmp example, gives us something like:
25731   //
25732   //         ucomiss %xmm1, %xmm0
25733   //         movss  <1.0f>, %xmm0
25734   //         jne     .LBB5_4
25735   //         jp      .LBB5_4
25736   //         xorps   %xmm0, %xmm0
25737   // .LBB5_4:
25738   //         retq
25739   //
25740   MachineInstr *CascadedCMOV = nullptr;
25741   MachineInstr *LastCMOV = &MI;
25742   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25743   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25744   MachineBasicBlock::iterator NextMIIt =
25745       std::next(MachineBasicBlock::iterator(MI));
25746
25747   // Check for case 1, where there are multiple CMOVs with the same condition
25748   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
25749   // number of jumps the most.
25750
25751   if (isCMOVPseudo(MI)) {
25752     // See if we have a string of CMOVS with the same condition.
25753     while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25754            (NextMIIt->getOperand(3).getImm() == CC ||
25755             NextMIIt->getOperand(3).getImm() == OppCC)) {
25756       LastCMOV = &*NextMIIt;
25757       ++NextMIIt;
25758     }
25759   }
25760
25761   // This checks for case 2, but only do this if we didn't already find
25762   // case 1, as indicated by LastCMOV == MI.
25763   if (LastCMOV == &MI && NextMIIt != BB->end() &&
25764       NextMIIt->getOpcode() == MI.getOpcode() &&
25765       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25766       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25767       NextMIIt->getOperand(1).isKill()) {
25768     CascadedCMOV = &*NextMIIt;
25769   }
25770
25771   MachineBasicBlock *jcc1MBB = nullptr;
25772
25773   // If we have a cascaded CMOV, we lower it to two successive branches to
25774   // the same block.  EFLAGS is used by both, so mark it as live in the second.
25775   if (CascadedCMOV) {
25776     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25777     F->insert(It, jcc1MBB);
25778     jcc1MBB->addLiveIn(X86::EFLAGS);
25779   }
25780
25781   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25782   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25783   F->insert(It, copy0MBB);
25784   F->insert(It, sinkMBB);
25785
25786   // If the EFLAGS register isn't dead in the terminator, then claim that it's
25787   // live into the sink and copy blocks.
25788   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25789
25790   MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25791   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25792       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25793     copy0MBB->addLiveIn(X86::EFLAGS);
25794     sinkMBB->addLiveIn(X86::EFLAGS);
25795   }
25796
25797   // Transfer the remainder of BB and its successor edges to sinkMBB.
25798   sinkMBB->splice(sinkMBB->begin(), BB,
25799                   std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25800   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25801
25802   // Add the true and fallthrough blocks as its successors.
25803   if (CascadedCMOV) {
25804     // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25805     BB->addSuccessor(jcc1MBB);
25806
25807     // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25808     // jump to the sinkMBB.
25809     jcc1MBB->addSuccessor(copy0MBB);
25810     jcc1MBB->addSuccessor(sinkMBB);
25811   } else {
25812     BB->addSuccessor(copy0MBB);
25813   }
25814
25815   // The true block target of the first (or only) branch is always sinkMBB.
25816   BB->addSuccessor(sinkMBB);
25817
25818   // Create the conditional branch instruction.
25819   unsigned Opc = X86::GetCondBranchFromCond(CC);
25820   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25821
25822   if (CascadedCMOV) {
25823     unsigned Opc2 = X86::GetCondBranchFromCond(
25824         (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25825     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25826   }
25827
25828   //  copy0MBB:
25829   //   %FalseValue = ...
25830   //   # fallthrough to sinkMBB
25831   copy0MBB->addSuccessor(sinkMBB);
25832
25833   //  sinkMBB:
25834   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25835   //  ...
25836   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25837   MachineBasicBlock::iterator MIItEnd =
25838     std::next(MachineBasicBlock::iterator(LastCMOV));
25839   MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25840   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25841   MachineInstrBuilder MIB;
25842
25843   // As we are creating the PHIs, we have to be careful if there is more than
25844   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
25845   // PHIs have to reference the individual true/false inputs from earlier PHIs.
25846   // That also means that PHI construction must work forward from earlier to
25847   // later, and that the code must maintain a mapping from earlier PHI's
25848   // destination registers, and the registers that went into the PHI.
25849
25850   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25851     unsigned DestReg = MIIt->getOperand(0).getReg();
25852     unsigned Op1Reg = MIIt->getOperand(1).getReg();
25853     unsigned Op2Reg = MIIt->getOperand(2).getReg();
25854
25855     // If this CMOV we are generating is the opposite condition from
25856     // the jump we generated, then we have to swap the operands for the
25857     // PHI that is going to be generated.
25858     if (MIIt->getOperand(3).getImm() == OppCC)
25859         std::swap(Op1Reg, Op2Reg);
25860
25861     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25862       Op1Reg = RegRewriteTable[Op1Reg].first;
25863
25864     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25865       Op2Reg = RegRewriteTable[Op2Reg].second;
25866
25867     MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25868                   TII->get(X86::PHI), DestReg)
25869           .addReg(Op1Reg).addMBB(copy0MBB)
25870           .addReg(Op2Reg).addMBB(thisMBB);
25871
25872     // Add this PHI to the rewrite table.
25873     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25874   }
25875
25876   // If we have a cascaded CMOV, the second Jcc provides the same incoming
25877   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25878   if (CascadedCMOV) {
25879     MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25880     // Copy the PHI result to the register defined by the second CMOV.
25881     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25882             DL, TII->get(TargetOpcode::COPY),
25883             CascadedCMOV->getOperand(0).getReg())
25884         .addReg(MI.getOperand(0).getReg());
25885     CascadedCMOV->eraseFromParent();
25886   }
25887
25888   // Now remove the CMOV(s).
25889   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25890     (MIIt++)->eraseFromParent();
25891
25892   return sinkMBB;
25893 }
25894
25895 MachineBasicBlock *
25896 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25897                                        MachineBasicBlock *BB) const {
25898   // Combine the following atomic floating-point modification pattern:
25899   //   a.store(reg OP a.load(acquire), release)
25900   // Transform them into:
25901   //   OPss (%gpr), %xmm
25902   //   movss %xmm, (%gpr)
25903   // Or sd equivalent for 64-bit operations.
25904   unsigned MOp, FOp;
25905   switch (MI.getOpcode()) {
25906   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25907   case X86::RELEASE_FADD32mr:
25908     FOp = X86::ADDSSrm;
25909     MOp = X86::MOVSSmr;
25910     break;
25911   case X86::RELEASE_FADD64mr:
25912     FOp = X86::ADDSDrm;
25913     MOp = X86::MOVSDmr;
25914     break;
25915   }
25916   const X86InstrInfo *TII = Subtarget.getInstrInfo();
25917   DebugLoc DL = MI.getDebugLoc();
25918   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25919   unsigned ValOpIdx = X86::AddrNumOperands;
25920   unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25921   MachineInstrBuilder MIB =
25922       BuildMI(*BB, MI, DL, TII->get(FOp),
25923               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25924           .addReg(VSrc);
25925   for (int i = 0; i < X86::AddrNumOperands; ++i) {
25926     MachineOperand &Operand = MI.getOperand(i);
25927     // Clear any kill flags on register operands as we'll create a second
25928     // instruction using the same address operands.
25929     if (Operand.isReg())
25930       Operand.setIsKill(false);
25931     MIB.add(Operand);
25932   }
25933   MachineInstr *FOpMI = MIB;
25934   MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25935   for (int i = 0; i < X86::AddrNumOperands; ++i)
25936     MIB.add(MI.getOperand(i));
25937   MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25938   MI.eraseFromParent(); // The pseudo instruction is gone now.
25939   return BB;
25940 }
25941
25942 MachineBasicBlock *
25943 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25944                                         MachineBasicBlock *BB) const {
25945   MachineFunction *MF = BB->getParent();
25946   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25947   DebugLoc DL = MI.getDebugLoc();
25948   const BasicBlock *LLVM_BB = BB->getBasicBlock();
25949
25950   assert(MF->shouldSplitStack());
25951
25952   const bool Is64Bit = Subtarget.is64Bit();
25953   const bool IsLP64 = Subtarget.isTarget64BitLP64();
25954
25955   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25956   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25957
25958   // BB:
25959   //  ... [Till the alloca]
25960   // If stacklet is not large enough, jump to mallocMBB
25961   //
25962   // bumpMBB:
25963   //  Allocate by subtracting from RSP
25964   //  Jump to continueMBB
25965   //
25966   // mallocMBB:
25967   //  Allocate by call to runtime
25968   //
25969   // continueMBB:
25970   //  ...
25971   //  [rest of original BB]
25972   //
25973
25974   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25975   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25976   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25977
25978   MachineRegisterInfo &MRI = MF->getRegInfo();
25979   const TargetRegisterClass *AddrRegClass =
25980       getRegClassFor(getPointerTy(MF->getDataLayout()));
25981
25982   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25983            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25984            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25985            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25986            sizeVReg = MI.getOperand(1).getReg(),
25987            physSPReg =
25988                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25989
25990   MachineFunction::iterator MBBIter = ++BB->getIterator();
25991
25992   MF->insert(MBBIter, bumpMBB);
25993   MF->insert(MBBIter, mallocMBB);
25994   MF->insert(MBBIter, continueMBB);
25995
25996   continueMBB->splice(continueMBB->begin(), BB,
25997                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
25998   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25999
26000   // Add code to the main basic block to check if the stack limit has been hit,
26001   // and if so, jump to mallocMBB otherwise to bumpMBB.
26002   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26003   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26004     .addReg(tmpSPVReg).addReg(sizeVReg);
26005   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26006     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26007     .addReg(SPLimitVReg);
26008   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26009
26010   // bumpMBB simply decreases the stack pointer, since we know the current
26011   // stacklet has enough space.
26012   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26013     .addReg(SPLimitVReg);
26014   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26015     .addReg(SPLimitVReg);
26016   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26017
26018   // Calls into a routine in libgcc to allocate more space from the heap.
26019   const uint32_t *RegMask =
26020       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26021   if (IsLP64) {
26022     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26023       .addReg(sizeVReg);
26024     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26025       .addExternalSymbol("__morestack_allocate_stack_space")
26026       .addRegMask(RegMask)
26027       .addReg(X86::RDI, RegState::Implicit)
26028       .addReg(X86::RAX, RegState::ImplicitDefine);
26029   } else if (Is64Bit) {
26030     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26031       .addReg(sizeVReg);
26032     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26033       .addExternalSymbol("__morestack_allocate_stack_space")
26034       .addRegMask(RegMask)
26035       .addReg(X86::EDI, RegState::Implicit)
26036       .addReg(X86::EAX, RegState::ImplicitDefine);
26037   } else {
26038     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26039       .addImm(12);
26040     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26041     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26042       .addExternalSymbol("__morestack_allocate_stack_space")
26043       .addRegMask(RegMask)
26044       .addReg(X86::EAX, RegState::ImplicitDefine);
26045   }
26046
26047   if (!Is64Bit)
26048     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26049       .addImm(16);
26050
26051   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26052     .addReg(IsLP64 ? X86::RAX : X86::EAX);
26053   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26054
26055   // Set up the CFG correctly.
26056   BB->addSuccessor(bumpMBB);
26057   BB->addSuccessor(mallocMBB);
26058   mallocMBB->addSuccessor(continueMBB);
26059   bumpMBB->addSuccessor(continueMBB);
26060
26061   // Take care of the PHI nodes.
26062   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26063           MI.getOperand(0).getReg())
26064       .addReg(mallocPtrVReg)
26065       .addMBB(mallocMBB)
26066       .addReg(bumpSPPtrVReg)
26067       .addMBB(bumpMBB);
26068
26069   // Delete the original pseudo instruction.
26070   MI.eraseFromParent();
26071
26072   // And we're done.
26073   return continueMBB;
26074 }
26075
26076 MachineBasicBlock *
26077 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26078                                        MachineBasicBlock *BB) const {
26079   MachineFunction *MF = BB->getParent();
26080   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26081   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26082   DebugLoc DL = MI.getDebugLoc();
26083
26084   assert(!isAsynchronousEHPersonality(
26085              classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26086          "SEH does not use catchret!");
26087
26088   // Only 32-bit EH needs to worry about manually restoring stack pointers.
26089   if (!Subtarget.is32Bit())
26090     return BB;
26091
26092   // C++ EH creates a new target block to hold the restore code, and wires up
26093   // the new block to the return destination with a normal JMP_4.
26094   MachineBasicBlock *RestoreMBB =
26095       MF->CreateMachineBasicBlock(BB->getBasicBlock());
26096   assert(BB->succ_size() == 1);
26097   MF->insert(std::next(BB->getIterator()), RestoreMBB);
26098   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26099   BB->addSuccessor(RestoreMBB);
26100   MI.getOperand(0).setMBB(RestoreMBB);
26101
26102   auto RestoreMBBI = RestoreMBB->begin();
26103   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26104   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26105   return BB;
26106 }
26107
26108 MachineBasicBlock *
26109 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26110                                        MachineBasicBlock *BB) const {
26111   MachineFunction *MF = BB->getParent();
26112   const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26113   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26114   // Only 32-bit SEH requires special handling for catchpad.
26115   if (IsSEH && Subtarget.is32Bit()) {
26116     const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26117     DebugLoc DL = MI.getDebugLoc();
26118     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26119   }
26120   MI.eraseFromParent();
26121   return BB;
26122 }
26123
26124 MachineBasicBlock *
26125 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26126                                       MachineBasicBlock *BB) const {
26127   // So, here we replace TLSADDR with the sequence:
26128   // adjust_stackdown -> TLSADDR -> adjust_stackup.
26129   // We need this because TLSADDR is lowered into calls
26130   // inside MC, therefore without the two markers shrink-wrapping
26131   // may push the prologue/epilogue pass them.
26132   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26133   DebugLoc DL = MI.getDebugLoc();
26134   MachineFunction &MF = *BB->getParent();
26135
26136   // Emit CALLSEQ_START right before the instruction.
26137   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26138   MachineInstrBuilder CallseqStart =
26139     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26140   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26141
26142   // Emit CALLSEQ_END right after the instruction.
26143   // We don't call erase from parent because we want to keep the
26144   // original instruction around.
26145   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26146   MachineInstrBuilder CallseqEnd =
26147     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26148   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26149
26150   return BB;
26151 }
26152
26153 MachineBasicBlock *
26154 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26155                                       MachineBasicBlock *BB) const {
26156   // This is pretty easy.  We're taking the value that we received from
26157   // our load from the relocation, sticking it in either RDI (x86-64)
26158   // or EAX and doing an indirect call.  The return value will then
26159   // be in the normal return register.
26160   MachineFunction *F = BB->getParent();
26161   const X86InstrInfo *TII = Subtarget.getInstrInfo();
26162   DebugLoc DL = MI.getDebugLoc();
26163
26164   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26165   assert(MI.getOperand(3).isGlobal() && "This should be a global");
26166
26167   // Get a register mask for the lowered call.
26168   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26169   // proper register mask.
26170   const uint32_t *RegMask =
26171       Subtarget.is64Bit() ?
26172       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26173       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26174   if (Subtarget.is64Bit()) {
26175     MachineInstrBuilder MIB =
26176         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26177             .addReg(X86::RIP)
26178             .addImm(0)
26179             .addReg(0)
26180             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26181                               MI.getOperand(3).getTargetFlags())
26182             .addReg(0);
26183     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26184     addDirectMem(MIB, X86::RDI);
26185     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26186   } else if (!isPositionIndependent()) {
26187     MachineInstrBuilder MIB =
26188         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26189             .addReg(0)
26190             .addImm(0)
26191             .addReg(0)
26192             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26193                               MI.getOperand(3).getTargetFlags())
26194             .addReg(0);
26195     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26196     addDirectMem(MIB, X86::EAX);
26197     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26198   } else {
26199     MachineInstrBuilder MIB =
26200         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26201             .addReg(TII->getGlobalBaseReg(F))
26202             .addImm(0)
26203             .addReg(0)
26204             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26205                               MI.getOperand(3).getTargetFlags())
26206             .addReg(0);
26207     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26208     addDirectMem(MIB, X86::EAX);
26209     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26210   }
26211
26212   MI.eraseFromParent(); // The pseudo instruction is gone now.
26213   return BB;
26214 }
26215
26216 MachineBasicBlock *
26217 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26218                                     MachineBasicBlock *MBB) const {
26219   DebugLoc DL = MI.getDebugLoc();
26220   MachineFunction *MF = MBB->getParent();
26221   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26222   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26223   MachineRegisterInfo &MRI = MF->getRegInfo();
26224
26225   const BasicBlock *BB = MBB->getBasicBlock();
26226   MachineFunction::iterator I = ++MBB->getIterator();
26227
26228   // Memory Reference
26229   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26230   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26231
26232   unsigned DstReg;
26233   unsigned MemOpndSlot = 0;
26234
26235   unsigned CurOp = 0;
26236
26237   DstReg = MI.getOperand(CurOp++).getReg();
26238   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26239   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26240   (void)TRI;
26241   unsigned mainDstReg = MRI.createVirtualRegister(RC);
26242   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26243
26244   MemOpndSlot = CurOp;
26245
26246   MVT PVT = getPointerTy(MF->getDataLayout());
26247   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26248          "Invalid Pointer Size!");
26249
26250   // For v = setjmp(buf), we generate
26251   //
26252   // thisMBB:
26253   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26254   //  SjLjSetup restoreMBB
26255   //
26256   // mainMBB:
26257   //  v_main = 0
26258   //
26259   // sinkMBB:
26260   //  v = phi(main, restore)
26261   //
26262   // restoreMBB:
26263   //  if base pointer being used, load it from frame
26264   //  v_restore = 1
26265
26266   MachineBasicBlock *thisMBB = MBB;
26267   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26268   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26269   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26270   MF->insert(I, mainMBB);
26271   MF->insert(I, sinkMBB);
26272   MF->push_back(restoreMBB);
26273   restoreMBB->setHasAddressTaken();
26274
26275   MachineInstrBuilder MIB;
26276
26277   // Transfer the remainder of BB and its successor edges to sinkMBB.
26278   sinkMBB->splice(sinkMBB->begin(), MBB,
26279                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26280   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26281
26282   // thisMBB:
26283   unsigned PtrStoreOpc = 0;
26284   unsigned LabelReg = 0;
26285   const int64_t LabelOffset = 1 * PVT.getStoreSize();
26286   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26287                      !isPositionIndependent();
26288
26289   // Prepare IP either in reg or imm.
26290   if (!UseImmLabel) {
26291     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26292     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26293     LabelReg = MRI.createVirtualRegister(PtrRC);
26294     if (Subtarget.is64Bit()) {
26295       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26296               .addReg(X86::RIP)
26297               .addImm(0)
26298               .addReg(0)
26299               .addMBB(restoreMBB)
26300               .addReg(0);
26301     } else {
26302       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26303       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26304               .addReg(XII->getGlobalBaseReg(MF))
26305               .addImm(0)
26306               .addReg(0)
26307               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26308               .addReg(0);
26309     }
26310   } else
26311     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26312   // Store IP
26313   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26314   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26315     if (i == X86::AddrDisp)
26316       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26317     else
26318       MIB.add(MI.getOperand(MemOpndSlot + i));
26319   }
26320   if (!UseImmLabel)
26321     MIB.addReg(LabelReg);
26322   else
26323     MIB.addMBB(restoreMBB);
26324   MIB.setMemRefs(MMOBegin, MMOEnd);
26325   // Setup
26326   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26327           .addMBB(restoreMBB);
26328
26329   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26330   MIB.addRegMask(RegInfo->getNoPreservedMask());
26331   thisMBB->addSuccessor(mainMBB);
26332   thisMBB->addSuccessor(restoreMBB);
26333
26334   // mainMBB:
26335   //  EAX = 0
26336   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26337   mainMBB->addSuccessor(sinkMBB);
26338
26339   // sinkMBB:
26340   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26341           TII->get(X86::PHI), DstReg)
26342     .addReg(mainDstReg).addMBB(mainMBB)
26343     .addReg(restoreDstReg).addMBB(restoreMBB);
26344
26345   // restoreMBB:
26346   if (RegInfo->hasBasePointer(*MF)) {
26347     const bool Uses64BitFramePtr =
26348         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26349     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26350     X86FI->setRestoreBasePointer(MF);
26351     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26352     unsigned BasePtr = RegInfo->getBaseRegister();
26353     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26354     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26355                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
26356       .setMIFlag(MachineInstr::FrameSetup);
26357   }
26358   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26359   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26360   restoreMBB->addSuccessor(sinkMBB);
26361
26362   MI.eraseFromParent();
26363   return sinkMBB;
26364 }
26365
26366 MachineBasicBlock *
26367 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26368                                      MachineBasicBlock *MBB) const {
26369   DebugLoc DL = MI.getDebugLoc();
26370   MachineFunction *MF = MBB->getParent();
26371   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26372   MachineRegisterInfo &MRI = MF->getRegInfo();
26373
26374   // Memory Reference
26375   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26376   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26377
26378   MVT PVT = getPointerTy(MF->getDataLayout());
26379   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26380          "Invalid Pointer Size!");
26381
26382   const TargetRegisterClass *RC =
26383     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26384   unsigned Tmp = MRI.createVirtualRegister(RC);
26385   // Since FP is only updated here but NOT referenced, it's treated as GPR.
26386   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26387   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26388   unsigned SP = RegInfo->getStackRegister();
26389
26390   MachineInstrBuilder MIB;
26391
26392   const int64_t LabelOffset = 1 * PVT.getStoreSize();
26393   const int64_t SPOffset = 2 * PVT.getStoreSize();
26394
26395   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26396   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26397
26398   // Reload FP
26399   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26400   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26401     MIB.add(MI.getOperand(i));
26402   MIB.setMemRefs(MMOBegin, MMOEnd);
26403   // Reload IP
26404   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26405   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26406     if (i == X86::AddrDisp)
26407       MIB.addDisp(MI.getOperand(i), LabelOffset);
26408     else
26409       MIB.add(MI.getOperand(i));
26410   }
26411   MIB.setMemRefs(MMOBegin, MMOEnd);
26412   // Reload SP
26413   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26414   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26415     if (i == X86::AddrDisp)
26416       MIB.addDisp(MI.getOperand(i), SPOffset);
26417     else
26418       MIB.add(MI.getOperand(i));
26419   }
26420   MIB.setMemRefs(MMOBegin, MMOEnd);
26421   // Jump
26422   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26423
26424   MI.eraseFromParent();
26425   return MBB;
26426 }
26427
26428 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26429                                                MachineBasicBlock *MBB,
26430                                                MachineBasicBlock *DispatchBB,
26431                                                int FI) const {
26432   DebugLoc DL = MI.getDebugLoc();
26433   MachineFunction *MF = MBB->getParent();
26434   MachineRegisterInfo *MRI = &MF->getRegInfo();
26435   const X86InstrInfo *TII = Subtarget.getInstrInfo();
26436
26437   MVT PVT = getPointerTy(MF->getDataLayout());
26438   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26439
26440   unsigned Op = 0;
26441   unsigned VR = 0;
26442
26443   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26444                      !isPositionIndependent();
26445
26446   if (UseImmLabel) {
26447     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26448   } else {
26449     const TargetRegisterClass *TRC =
26450         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26451     VR = MRI->createVirtualRegister(TRC);
26452     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26453
26454     if (Subtarget.is64Bit())
26455       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26456           .addReg(X86::RIP)
26457           .addImm(1)
26458           .addReg(0)
26459           .addMBB(DispatchBB)
26460           .addReg(0);
26461     else
26462       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26463           .addReg(0) /* TII->getGlobalBaseReg(MF) */
26464           .addImm(1)
26465           .addReg(0)
26466           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26467           .addReg(0);
26468   }
26469
26470   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26471   addFrameReference(MIB, FI, 36);
26472   if (UseImmLabel)
26473     MIB.addMBB(DispatchBB);
26474   else
26475     MIB.addReg(VR);
26476 }
26477
26478 MachineBasicBlock *
26479 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26480                                          MachineBasicBlock *BB) const {
26481   DebugLoc DL = MI.getDebugLoc();
26482   MachineFunction *MF = BB->getParent();
26483   MachineFrameInfo &MFI = MF->getFrameInfo();
26484   MachineRegisterInfo *MRI = &MF->getRegInfo();
26485   const X86InstrInfo *TII = Subtarget.getInstrInfo();
26486   int FI = MFI.getFunctionContextIndex();
26487
26488   // Get a mapping of the call site numbers to all of the landing pads they're
26489   // associated with.
26490   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26491   unsigned MaxCSNum = 0;
26492   for (auto &MBB : *MF) {
26493     if (!MBB.isEHPad())
26494       continue;
26495
26496     MCSymbol *Sym = nullptr;
26497     for (const auto &MI : MBB) {
26498       if (MI.isDebugValue())
26499         continue;
26500
26501       assert(MI.isEHLabel() && "expected EH_LABEL");
26502       Sym = MI.getOperand(0).getMCSymbol();
26503       break;
26504     }
26505
26506     if (!MF->hasCallSiteLandingPad(Sym))
26507       continue;
26508
26509     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26510       CallSiteNumToLPad[CSI].push_back(&MBB);
26511       MaxCSNum = std::max(MaxCSNum, CSI);
26512     }
26513   }
26514
26515   // Get an ordered list of the machine basic blocks for the jump table.
26516   std::vector<MachineBasicBlock *> LPadList;
26517   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26518   LPadList.reserve(CallSiteNumToLPad.size());
26519
26520   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26521     for (auto &LP : CallSiteNumToLPad[CSI]) {
26522       LPadList.push_back(LP);
26523       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26524     }
26525   }
26526
26527   assert(!LPadList.empty() &&
26528          "No landing pad destinations for the dispatch jump table!");
26529
26530   // Create the MBBs for the dispatch code.
26531
26532   // Shove the dispatch's address into the return slot in the function context.
26533   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26534   DispatchBB->setIsEHPad(true);
26535
26536   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26537   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26538   DispatchBB->addSuccessor(TrapBB);
26539
26540   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26541   DispatchBB->addSuccessor(DispContBB);
26542
26543   // Insert MBBs.
26544   MF->push_back(DispatchBB);
26545   MF->push_back(DispContBB);
26546   MF->push_back(TrapBB);
26547
26548   // Insert code into the entry block that creates and registers the function
26549   // context.
26550   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26551
26552   // Create the jump table and associated information
26553   MachineJumpTableInfo *JTI =
26554       MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26555   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26556
26557   const X86RegisterInfo &RI = TII->getRegisterInfo();
26558   // Add a register mask with no preserved registers.  This results in all
26559   // registers being marked as clobbered.
26560   if (RI.hasBasePointer(*MF)) {
26561     const bool FPIs64Bit =
26562         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26563     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26564     MFI->setRestoreBasePointer(MF);
26565
26566     unsigned FP = RI.getFrameRegister(*MF);
26567     unsigned BP = RI.getBaseRegister();
26568     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26569     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26570                  MFI->getRestoreBasePointerOffset())
26571         .addRegMask(RI.getNoPreservedMask());
26572   } else {
26573     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26574         .addRegMask(RI.getNoPreservedMask());
26575   }
26576
26577   unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26578   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26579                     4);
26580   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26581       .addReg(IReg)
26582       .addImm(LPadList.size());
26583   BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
26584
26585   unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26586   BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26587       .addReg(IReg)
26588       .addImm(1);
26589   BuildMI(DispContBB, DL,
26590           TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26591       .addReg(0)
26592       .addImm(Subtarget.is64Bit() ? 8 : 4)
26593       .addReg(JReg)
26594       .addJumpTableIndex(MJTI)
26595       .addReg(0);
26596
26597   // Add the jump table entries as successors to the MBB.
26598   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26599   for (auto &LP : LPadList)
26600     if (SeenMBBs.insert(LP).second)
26601       DispContBB->addSuccessor(LP);
26602
26603   // N.B. the order the invoke BBs are processed in doesn't matter here.
26604   SmallVector<MachineBasicBlock *, 64> MBBLPads;
26605   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26606   for (MachineBasicBlock *MBB : InvokeBBs) {
26607     // Remove the landing pad successor from the invoke block and replace it
26608     // with the new dispatch block.
26609     // Keep a copy of Successors since it's modified inside the loop.
26610     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26611                                                    MBB->succ_rend());
26612     // FIXME: Avoid quadratic complexity.
26613     for (auto MBBS : Successors) {
26614       if (MBBS->isEHPad()) {
26615         MBB->removeSuccessor(MBBS);
26616         MBBLPads.push_back(MBBS);
26617       }
26618     }
26619
26620     MBB->addSuccessor(DispatchBB);
26621
26622     // Find the invoke call and mark all of the callee-saved registers as
26623     // 'implicit defined' so that they're spilled.  This prevents code from
26624     // moving instructions to before the EH block, where they will never be
26625     // executed.
26626     for (auto &II : reverse(*MBB)) {
26627       if (!II.isCall())
26628         continue;
26629
26630       DenseMap<unsigned, bool> DefRegs;
26631       for (auto &MOp : II.operands())
26632         if (MOp.isReg())
26633           DefRegs[MOp.getReg()] = true;
26634
26635       MachineInstrBuilder MIB(*MF, &II);
26636       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26637         unsigned Reg = SavedRegs[RI];
26638         if (!DefRegs[Reg])
26639           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26640       }
26641
26642       break;
26643     }
26644   }
26645
26646   // Mark all former landing pads as non-landing pads.  The dispatch is the only
26647   // landing pad now.
26648   for (auto &LP : MBBLPads)
26649     LP->setIsEHPad(false);
26650
26651   // The instruction is gone now.
26652   MI.eraseFromParent();
26653   return BB;
26654 }
26655
26656 MachineBasicBlock *
26657 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26658                                                MachineBasicBlock *BB) const {
26659   MachineFunction *MF = BB->getParent();
26660   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26661   DebugLoc DL = MI.getDebugLoc();
26662
26663   switch (MI.getOpcode()) {
26664   default: llvm_unreachable("Unexpected instr type to insert");
26665   case X86::TAILJMPd64:
26666   case X86::TAILJMPr64:
26667   case X86::TAILJMPm64:
26668   case X86::TAILJMPr64_REX:
26669   case X86::TAILJMPm64_REX:
26670     llvm_unreachable("TAILJMP64 would not be touched here.");
26671   case X86::TCRETURNdi64:
26672   case X86::TCRETURNri64:
26673   case X86::TCRETURNmi64:
26674     return BB;
26675   case X86::TLS_addr32:
26676   case X86::TLS_addr64:
26677   case X86::TLS_base_addr32:
26678   case X86::TLS_base_addr64:
26679     return EmitLoweredTLSAddr(MI, BB);
26680   case X86::CATCHRET:
26681     return EmitLoweredCatchRet(MI, BB);
26682   case X86::CATCHPAD:
26683     return EmitLoweredCatchPad(MI, BB);
26684   case X86::SEG_ALLOCA_32:
26685   case X86::SEG_ALLOCA_64:
26686     return EmitLoweredSegAlloca(MI, BB);
26687   case X86::TLSCall_32:
26688   case X86::TLSCall_64:
26689     return EmitLoweredTLSCall(MI, BB);
26690   case X86::CMOV_FR32:
26691   case X86::CMOV_FR64:
26692   case X86::CMOV_FR128:
26693   case X86::CMOV_GR8:
26694   case X86::CMOV_GR16:
26695   case X86::CMOV_GR32:
26696   case X86::CMOV_RFP32:
26697   case X86::CMOV_RFP64:
26698   case X86::CMOV_RFP80:
26699   case X86::CMOV_V2F64:
26700   case X86::CMOV_V2I64:
26701   case X86::CMOV_V4F32:
26702   case X86::CMOV_V4F64:
26703   case X86::CMOV_V4I64:
26704   case X86::CMOV_V16F32:
26705   case X86::CMOV_V8F32:
26706   case X86::CMOV_V8F64:
26707   case X86::CMOV_V8I64:
26708   case X86::CMOV_V8I1:
26709   case X86::CMOV_V16I1:
26710   case X86::CMOV_V32I1:
26711   case X86::CMOV_V64I1:
26712     return EmitLoweredSelect(MI, BB);
26713
26714   case X86::RDFLAGS32:
26715   case X86::RDFLAGS64: {
26716     unsigned PushF =
26717         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26718     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26719     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26720     // Permit reads of the FLAGS register without it being defined.
26721     // This intrinsic exists to read external processor state in flags, such as
26722     // the trap flag, interrupt flag, and direction flag, none of which are
26723     // modeled by the backend.
26724     Push->getOperand(2).setIsUndef();
26725     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26726
26727     MI.eraseFromParent(); // The pseudo is gone now.
26728     return BB;
26729   }
26730
26731   case X86::WRFLAGS32:
26732   case X86::WRFLAGS64: {
26733     unsigned Push =
26734         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26735     unsigned PopF =
26736         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26737     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26738     BuildMI(*BB, MI, DL, TII->get(PopF));
26739
26740     MI.eraseFromParent(); // The pseudo is gone now.
26741     return BB;
26742   }
26743
26744   case X86::RELEASE_FADD32mr:
26745   case X86::RELEASE_FADD64mr:
26746     return EmitLoweredAtomicFP(MI, BB);
26747
26748   case X86::FP32_TO_INT16_IN_MEM:
26749   case X86::FP32_TO_INT32_IN_MEM:
26750   case X86::FP32_TO_INT64_IN_MEM:
26751   case X86::FP64_TO_INT16_IN_MEM:
26752   case X86::FP64_TO_INT32_IN_MEM:
26753   case X86::FP64_TO_INT64_IN_MEM:
26754   case X86::FP80_TO_INT16_IN_MEM:
26755   case X86::FP80_TO_INT32_IN_MEM:
26756   case X86::FP80_TO_INT64_IN_MEM: {
26757     // Change the floating point control register to use "round towards zero"
26758     // mode when truncating to an integer value.
26759     int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26760     addFrameReference(BuildMI(*BB, MI, DL,
26761                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
26762
26763     // Load the old value of the high byte of the control word...
26764     unsigned OldCW =
26765       MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26766     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26767                       CWFrameIdx);
26768
26769     // Set the high part to be round to zero...
26770     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26771       .addImm(0xC7F);
26772
26773     // Reload the modified control word now...
26774     addFrameReference(BuildMI(*BB, MI, DL,
26775                               TII->get(X86::FLDCW16m)), CWFrameIdx);
26776
26777     // Restore the memory image of control word to original value
26778     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26779       .addReg(OldCW);
26780
26781     // Get the X86 opcode to use.
26782     unsigned Opc;
26783     switch (MI.getOpcode()) {
26784     default: llvm_unreachable("illegal opcode!");
26785     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26786     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26787     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26788     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26789     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26790     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26791     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26792     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26793     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26794     }
26795
26796     X86AddressMode AM = getAddressFromInstr(&MI, 0);
26797     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26798         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26799
26800     // Reload the original control word now.
26801     addFrameReference(BuildMI(*BB, MI, DL,
26802                               TII->get(X86::FLDCW16m)), CWFrameIdx);
26803
26804     MI.eraseFromParent(); // The pseudo instruction is gone now.
26805     return BB;
26806   }
26807     // String/text processing lowering.
26808   case X86::PCMPISTRM128REG:
26809   case X86::VPCMPISTRM128REG:
26810   case X86::PCMPISTRM128MEM:
26811   case X86::VPCMPISTRM128MEM:
26812   case X86::PCMPESTRM128REG:
26813   case X86::VPCMPESTRM128REG:
26814   case X86::PCMPESTRM128MEM:
26815   case X86::VPCMPESTRM128MEM:
26816     assert(Subtarget.hasSSE42() &&
26817            "Target must have SSE4.2 or AVX features enabled");
26818     return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26819
26820   // String/text processing lowering.
26821   case X86::PCMPISTRIREG:
26822   case X86::VPCMPISTRIREG:
26823   case X86::PCMPISTRIMEM:
26824   case X86::VPCMPISTRIMEM:
26825   case X86::PCMPESTRIREG:
26826   case X86::VPCMPESTRIREG:
26827   case X86::PCMPESTRIMEM:
26828   case X86::VPCMPESTRIMEM:
26829     assert(Subtarget.hasSSE42() &&
26830            "Target must have SSE4.2 or AVX features enabled");
26831     return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26832
26833   // Thread synchronization.
26834   case X86::MONITOR:
26835     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26836   case X86::MONITORX:
26837     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26838
26839   // Cache line zero
26840   case X86::CLZERO:
26841     return emitClzero(&MI, BB, Subtarget);
26842
26843   // PKU feature
26844   case X86::WRPKRU:
26845     return emitWRPKRU(MI, BB, Subtarget);
26846   case X86::RDPKRU:
26847     return emitRDPKRU(MI, BB, Subtarget);
26848   // xbegin
26849   case X86::XBEGIN:
26850     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26851
26852   case X86::VASTART_SAVE_XMM_REGS:
26853     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26854
26855   case X86::VAARG_64:
26856     return EmitVAARG64WithCustomInserter(MI, BB);
26857
26858   case X86::EH_SjLj_SetJmp32:
26859   case X86::EH_SjLj_SetJmp64:
26860     return emitEHSjLjSetJmp(MI, BB);
26861
26862   case X86::EH_SjLj_LongJmp32:
26863   case X86::EH_SjLj_LongJmp64:
26864     return emitEHSjLjLongJmp(MI, BB);
26865
26866   case X86::Int_eh_sjlj_setup_dispatch:
26867     return EmitSjLjDispatchBlock(MI, BB);
26868
26869   case TargetOpcode::STATEPOINT:
26870     // As an implementation detail, STATEPOINT shares the STACKMAP format at
26871     // this point in the process.  We diverge later.
26872     return emitPatchPoint(MI, BB);
26873
26874   case TargetOpcode::STACKMAP:
26875   case TargetOpcode::PATCHPOINT:
26876     return emitPatchPoint(MI, BB);
26877
26878   case TargetOpcode::PATCHABLE_EVENT_CALL:
26879     // Do nothing here, handle in xray instrumentation pass.
26880     return BB;
26881
26882   case X86::LCMPXCHG8B: {
26883     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26884     // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
26885     // requires a memory operand. If it happens that current architecture is
26886     // i686 and for current function we need a base pointer
26887     // - which is ESI for i686 - register allocator would not be able to
26888     // allocate registers for an address in form of X(%reg, %reg, Y)
26889     // - there never would be enough unreserved registers during regalloc
26890     // (without the need for base ptr the only option would be X(%edi, %esi, Y).
26891     // We are giving a hand to register allocator by precomputing the address in
26892     // a new vreg using LEA.
26893
26894     // If it is not i686 or there is no base pointer - nothing to do here.
26895     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26896       return BB;
26897
26898     // Even though this code does not necessarily needs the base pointer to
26899     // be ESI, we check for that. The reason: if this assert fails, there are
26900     // some changes happened in the compiler base pointer handling, which most
26901     // probably have to be addressed somehow here.
26902     assert(TRI->getBaseRegister() == X86::ESI &&
26903            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26904            "base pointer in mind");
26905
26906     MachineRegisterInfo &MRI = MF->getRegInfo();
26907     MVT SPTy = getPointerTy(MF->getDataLayout());
26908     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26909     unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26910
26911     X86AddressMode AM = getAddressFromInstr(&MI, 0);
26912     // Regalloc does not need any help when the memory operand of CMPXCHG8B
26913     // does not use index register.
26914     if (AM.IndexReg == X86::NoRegister)
26915       return BB;
26916
26917     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26918     // four operand definitions that are E[ABCD] registers. We skip them and
26919     // then insert the LEA.
26920     MachineBasicBlock::iterator MBBI(MI);
26921     while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26922            MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26923       --MBBI;
26924     addFullAddress(
26925         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26926
26927     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26928
26929     return BB;
26930   }
26931   case X86::LCMPXCHG16B:
26932     return BB;
26933   case X86::LCMPXCHG8B_SAVE_EBX:
26934   case X86::LCMPXCHG16B_SAVE_RBX: {
26935     unsigned BasePtr =
26936         MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26937     if (!BB->isLiveIn(BasePtr))
26938       BB->addLiveIn(BasePtr);
26939     return BB;
26940   }
26941   }
26942 }
26943
26944 //===----------------------------------------------------------------------===//
26945 //                           X86 Optimization Hooks
26946 //===----------------------------------------------------------------------===//
26947
26948 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26949                                                       KnownBits &Known,
26950                                                       const APInt &DemandedElts,
26951                                                       const SelectionDAG &DAG,
26952                                                       unsigned Depth) const {
26953   unsigned BitWidth = Known.getBitWidth();
26954   unsigned Opc = Op.getOpcode();
26955   EVT VT = Op.getValueType();
26956   assert((Opc >= ISD::BUILTIN_OP_END ||
26957           Opc == ISD::INTRINSIC_WO_CHAIN ||
26958           Opc == ISD::INTRINSIC_W_CHAIN ||
26959           Opc == ISD::INTRINSIC_VOID) &&
26960          "Should use MaskedValueIsZero if you don't know whether Op"
26961          " is a target node!");
26962
26963   Known.resetAll();
26964   switch (Opc) {
26965   default: break;
26966   case X86ISD::ADD:
26967   case X86ISD::SUB:
26968   case X86ISD::ADC:
26969   case X86ISD::SBB:
26970   case X86ISD::SMUL:
26971   case X86ISD::UMUL:
26972   case X86ISD::INC:
26973   case X86ISD::DEC:
26974   case X86ISD::OR:
26975   case X86ISD::XOR:
26976   case X86ISD::AND:
26977     // These nodes' second result is a boolean.
26978     if (Op.getResNo() == 0)
26979       break;
26980     LLVM_FALLTHROUGH;
26981   case X86ISD::SETCC:
26982     Known.Zero.setBitsFrom(1);
26983     break;
26984   case X86ISD::MOVMSK: {
26985     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26986     Known.Zero.setBitsFrom(NumLoBits);
26987     break;
26988   }
26989   case X86ISD::VSHLI:
26990   case X86ISD::VSRLI: {
26991     if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26992       if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26993         Known.setAllZero();
26994         break;
26995       }
26996
26997       DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26998       unsigned ShAmt = ShiftImm->getZExtValue();
26999       if (Opc == X86ISD::VSHLI) {
27000         Known.Zero <<= ShAmt;
27001         Known.One <<= ShAmt;
27002         // Low bits are known zero.
27003         Known.Zero.setLowBits(ShAmt);
27004       } else {
27005         Known.Zero.lshrInPlace(ShAmt);
27006         Known.One.lshrInPlace(ShAmt);
27007         // High bits are known zero.
27008         Known.Zero.setHighBits(ShAmt);
27009       }
27010     }
27011     break;
27012   }
27013   case X86ISD::VZEXT: {
27014     SDValue N0 = Op.getOperand(0);
27015     unsigned NumElts = VT.getVectorNumElements();
27016
27017     EVT SrcVT = N0.getValueType();
27018     unsigned InNumElts = SrcVT.getVectorNumElements();
27019     unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27020     assert(InNumElts >= NumElts && "Illegal VZEXT input");
27021
27022     Known = KnownBits(InBitWidth);
27023     APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27024     DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27025     Known = Known.zext(BitWidth);
27026     Known.Zero.setBitsFrom(InBitWidth);
27027     break;
27028   }
27029   }
27030 }
27031
27032 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27033     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27034     unsigned Depth) const {
27035   unsigned VTBits = Op.getScalarValueSizeInBits();
27036   unsigned Opcode = Op.getOpcode();
27037   switch (Opcode) {
27038   case X86ISD::SETCC_CARRY:
27039     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27040     return VTBits;
27041
27042   case X86ISD::VSEXT: {
27043     SDValue Src = Op.getOperand(0);
27044     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27045     Tmp += VTBits - Src.getScalarValueSizeInBits();
27046     return Tmp;
27047   }
27048
27049   case X86ISD::VSHLI: {
27050     SDValue Src = Op.getOperand(0);
27051     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27052     APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27053     if (ShiftVal.uge(VTBits))
27054       return VTBits; // Shifted all bits out --> zero.
27055     if (ShiftVal.uge(Tmp))
27056       return 1; // Shifted all sign bits out --> unknown.
27057     return Tmp - ShiftVal.getZExtValue();
27058   }
27059
27060   case X86ISD::VSRAI: {
27061     SDValue Src = Op.getOperand(0);
27062     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27063     APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27064     ShiftVal += Tmp;
27065     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27066   }
27067
27068   case X86ISD::PCMPGT:
27069   case X86ISD::PCMPEQ:
27070   case X86ISD::CMPP:
27071   case X86ISD::VPCOM:
27072   case X86ISD::VPCOMU:
27073     // Vector compares return zero/all-bits result values.
27074     return VTBits;
27075   }
27076
27077   // Fallback case.
27078   return 1;
27079 }
27080
27081 /// Returns true (and the GlobalValue and the offset) if the node is a
27082 /// GlobalAddress + offset.
27083 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27084                                        const GlobalValue* &GA,
27085                                        int64_t &Offset) const {
27086   if (N->getOpcode() == X86ISD::Wrapper) {
27087     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27088       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27089       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27090       return true;
27091     }
27092   }
27093   return TargetLowering::isGAPlusOffset(N, GA, Offset);
27094 }
27095
27096 // Attempt to match a combined shuffle mask against supported unary shuffle
27097 // instructions.
27098 // TODO: Investigate sharing more of this with shuffle lowering.
27099 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27100                                     bool AllowFloatDomain, bool AllowIntDomain,
27101                                     SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27102                                     const X86Subtarget &Subtarget,
27103                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27104   unsigned NumMaskElts = Mask.size();
27105   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27106
27107   // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27108   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27109   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27110                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27111     unsigned MaxScale = 64 / MaskEltSize;
27112     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27113       bool Match = true;
27114       unsigned NumDstElts = NumMaskElts / Scale;
27115       for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27116         Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27117         Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27118       }
27119       if (Match) {
27120         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27121         SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
27122         if (SrcVT != MaskVT)
27123           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27124         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27125         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27126         Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
27127                                   : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27128         return true;
27129       }
27130     }
27131   }
27132
27133   // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27134   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27135       isUndefOrEqual(Mask[0], 0) &&
27136       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27137     Shuffle = X86ISD::VZEXT_MOVL;
27138     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27139     return true;
27140   }
27141
27142   // Check if we have SSE3 which will let us use MOVDDUP etc. The
27143   // instructions are no slower than UNPCKLPD but has the option to
27144   // fold the input operand into even an unaligned memory load.
27145   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27146     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
27147       Shuffle = X86ISD::MOVDDUP;
27148       SrcVT = DstVT = MVT::v2f64;
27149       return true;
27150     }
27151     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27152       Shuffle = X86ISD::MOVSLDUP;
27153       SrcVT = DstVT = MVT::v4f32;
27154       return true;
27155     }
27156     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27157       Shuffle = X86ISD::MOVSHDUP;
27158       SrcVT = DstVT = MVT::v4f32;
27159       return true;
27160     }
27161   }
27162
27163   if (MaskVT.is256BitVector() && AllowFloatDomain) {
27164     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27165     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27166       Shuffle = X86ISD::MOVDDUP;
27167       SrcVT = DstVT = MVT::v4f64;
27168       return true;
27169     }
27170     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27171       Shuffle = X86ISD::MOVSLDUP;
27172       SrcVT = DstVT = MVT::v8f32;
27173       return true;
27174     }
27175     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27176       Shuffle = X86ISD::MOVSHDUP;
27177       SrcVT = DstVT = MVT::v8f32;
27178       return true;
27179     }
27180   }
27181
27182   if (MaskVT.is512BitVector() && AllowFloatDomain) {
27183     assert(Subtarget.hasAVX512() &&
27184            "AVX512 required for 512-bit vector shuffles");
27185     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27186       Shuffle = X86ISD::MOVDDUP;
27187       SrcVT = DstVT = MVT::v8f64;
27188       return true;
27189     }
27190     if (isTargetShuffleEquivalent(
27191             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27192       Shuffle = X86ISD::MOVSLDUP;
27193       SrcVT = DstVT = MVT::v16f32;
27194       return true;
27195     }
27196     if (isTargetShuffleEquivalent(
27197             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27198       Shuffle = X86ISD::MOVSHDUP;
27199       SrcVT = DstVT = MVT::v16f32;
27200       return true;
27201     }
27202   }
27203
27204   // Attempt to match against broadcast-from-vector.
27205   if (Subtarget.hasAVX2()) {
27206     SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27207     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27208       SrcVT = DstVT = MaskVT;
27209       Shuffle = X86ISD::VBROADCAST;
27210       return true;
27211     }
27212   }
27213
27214   return false;
27215 }
27216
27217 // Attempt to match a combined shuffle mask against supported unary immediate
27218 // permute instructions.
27219 // TODO: Investigate sharing more of this with shuffle lowering.
27220 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27221                                            const APInt &Zeroable,
27222                                            bool AllowFloatDomain,
27223                                            bool AllowIntDomain,
27224                                            const X86Subtarget &Subtarget,
27225                                            unsigned &Shuffle, MVT &ShuffleVT,
27226                                            unsigned &PermuteImm) {
27227   unsigned NumMaskElts = Mask.size();
27228   unsigned InputSizeInBits = MaskVT.getSizeInBits();
27229   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27230   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27231
27232   bool ContainsZeros =
27233       llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27234
27235   // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
27236   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27237     // Check for lane crossing permutes.
27238     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27239       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27240       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27241         Shuffle = X86ISD::VPERMI;
27242         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27243         PermuteImm = getV4X86ShuffleImm(Mask);
27244         return true;
27245       }
27246       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27247         SmallVector<int, 4> RepeatedMask;
27248         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27249           Shuffle = X86ISD::VPERMI;
27250           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27251           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27252           return true;
27253         }
27254       }
27255     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27256       // VPERMILPD can permute with a non-repeating shuffle.
27257       Shuffle = X86ISD::VPERMILPI;
27258       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27259       PermuteImm = 0;
27260       for (int i = 0, e = Mask.size(); i != e; ++i) {
27261         int M = Mask[i];
27262         if (M == SM_SentinelUndef)
27263           continue;
27264         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27265         PermuteImm |= (M & 1) << i;
27266       }
27267       return true;
27268     }
27269   }
27270
27271   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27272   // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27273   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27274   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27275       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27276     SmallVector<int, 4> RepeatedMask;
27277     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27278       // Narrow the repeated mask to create 32-bit element permutes.
27279       SmallVector<int, 4> WordMask = RepeatedMask;
27280       if (MaskScalarSizeInBits == 64)
27281         scaleShuffleMask(2, RepeatedMask, WordMask);
27282
27283       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27284       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27285       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27286       PermuteImm = getV4X86ShuffleImm(WordMask);
27287       return true;
27288     }
27289   }
27290
27291   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27292   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27293     SmallVector<int, 4> RepeatedMask;
27294     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27295       ArrayRef<int> LoMask(Mask.data() + 0, 4);
27296       ArrayRef<int> HiMask(Mask.data() + 4, 4);
27297
27298       // PSHUFLW: permute lower 4 elements only.
27299       if (isUndefOrInRange(LoMask, 0, 4) &&
27300           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27301         Shuffle = X86ISD::PSHUFLW;
27302         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27303         PermuteImm = getV4X86ShuffleImm(LoMask);
27304         return true;
27305       }
27306
27307       // PSHUFHW: permute upper 4 elements only.
27308       if (isUndefOrInRange(HiMask, 4, 8) &&
27309           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27310         // Offset the HiMask so that we can create the shuffle immediate.
27311         int OffsetHiMask[4];
27312         for (int i = 0; i != 4; ++i)
27313           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27314
27315         Shuffle = X86ISD::PSHUFHW;
27316         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27317         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27318         return true;
27319       }
27320     }
27321   }
27322
27323   // Attempt to match against byte/bit shifts.
27324   // FIXME: Add 512-bit support.
27325   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27326                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27327     int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
27328                                              MaskScalarSizeInBits, Mask,
27329                                              0, Zeroable, Subtarget);
27330     if (0 < ShiftAmt) {
27331       PermuteImm = (unsigned)ShiftAmt;
27332       return true;
27333     }
27334   }
27335
27336   return false;
27337 }
27338
27339 // Attempt to match a combined unary shuffle mask against supported binary
27340 // shuffle instructions.
27341 // TODO: Investigate sharing more of this with shuffle lowering.
27342 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27343                                      bool AllowFloatDomain, bool AllowIntDomain,
27344                                      SDValue &V1, SDValue &V2, SDLoc &DL,
27345                                      SelectionDAG &DAG,
27346                                      const X86Subtarget &Subtarget,
27347                                      unsigned &Shuffle, MVT &ShuffleVT,
27348                                      bool IsUnary) {
27349   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27350
27351   if (MaskVT.is128BitVector()) {
27352     if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27353       V2 = V1;
27354       Shuffle = X86ISD::MOVLHPS;
27355       ShuffleVT = MVT::v4f32;
27356       return true;
27357     }
27358     if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27359       V2 = V1;
27360       Shuffle = X86ISD::MOVHLPS;
27361       ShuffleVT = MVT::v4f32;
27362       return true;
27363     }
27364     if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27365         (AllowFloatDomain || !Subtarget.hasSSE41())) {
27366       std::swap(V1, V2);
27367       Shuffle = X86ISD::MOVSD;
27368       ShuffleVT = MaskVT;
27369       return true;
27370     }
27371     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27372         (AllowFloatDomain || !Subtarget.hasSSE41())) {
27373       Shuffle = X86ISD::MOVSS;
27374       ShuffleVT = MaskVT;
27375       return true;
27376     }
27377   }
27378
27379   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27380   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27381       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27382       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27383       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27384       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27385     if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27386                                     DAG, Subtarget)) {
27387       ShuffleVT = MaskVT;
27388       if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27389         ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27390       return true;
27391     }
27392   }
27393
27394   return false;
27395 }
27396
27397 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27398                                             const APInt &Zeroable,
27399                                             bool AllowFloatDomain,
27400                                             bool AllowIntDomain,
27401                                             SDValue &V1, SDValue &V2, SDLoc &DL,
27402                                             SelectionDAG &DAG,
27403                                             const X86Subtarget &Subtarget,
27404                                             unsigned &Shuffle, MVT &ShuffleVT,
27405                                             unsigned &PermuteImm) {
27406   unsigned NumMaskElts = Mask.size();
27407   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27408
27409   // Attempt to match against PALIGNR byte rotate.
27410   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27411                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27412     int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27413     if (0 < ByteRotation) {
27414       Shuffle = X86ISD::PALIGNR;
27415       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27416       PermuteImm = ByteRotation;
27417       return true;
27418     }
27419   }
27420
27421   // Attempt to combine to X86ISD::BLENDI.
27422   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27423                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27424       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27425     uint64_t BlendMask = 0;
27426     bool ForceV1Zero = false, ForceV2Zero = false;
27427     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27428     if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27429                                   BlendMask)) {
27430       if (MaskVT == MVT::v16i16) {
27431         // We can only use v16i16 PBLENDW if the lanes are repeated.
27432         SmallVector<int, 8> RepeatedMask;
27433         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27434                                         RepeatedMask)) {
27435           assert(RepeatedMask.size() == 8 &&
27436                  "Repeated mask size doesn't match!");
27437           PermuteImm = 0;
27438           for (int i = 0; i < 8; ++i)
27439             if (RepeatedMask[i] >= 8)
27440               PermuteImm |= 1 << i;
27441           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27442           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27443           Shuffle = X86ISD::BLENDI;
27444           ShuffleVT = MaskVT;
27445           return true;
27446         }
27447       } else {
27448         // Determine a type compatible with X86ISD::BLENDI.
27449         ShuffleVT = MaskVT;
27450         if (Subtarget.hasAVX2()) {
27451           if (ShuffleVT == MVT::v4i64)
27452             ShuffleVT = MVT::v8i32;
27453           else if (ShuffleVT == MVT::v2i64)
27454             ShuffleVT = MVT::v4i32;
27455         } else {
27456           if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27457             ShuffleVT = MVT::v8i16;
27458           else if (ShuffleVT == MVT::v4i64)
27459             ShuffleVT = MVT::v4f64;
27460           else if (ShuffleVT == MVT::v8i32)
27461             ShuffleVT = MVT::v8f32;
27462         }
27463
27464         if (!ShuffleVT.isFloatingPoint()) {
27465           int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27466           BlendMask =
27467               scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27468           ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27469           ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27470         }
27471
27472         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27473         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27474         PermuteImm = (unsigned)BlendMask;
27475         Shuffle = X86ISD::BLENDI;
27476         return true;
27477       }
27478     }
27479   }
27480
27481   // Attempt to combine to INSERTPS.
27482   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27483       MaskVT.is128BitVector()) {
27484     if (Zeroable.getBoolValue() &&
27485         matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27486       Shuffle = X86ISD::INSERTPS;
27487       ShuffleVT = MVT::v4f32;
27488       return true;
27489     }
27490   }
27491
27492   // Attempt to combine to SHUFPD.
27493   if (AllowFloatDomain && EltSizeInBits == 64 &&
27494       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27495        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27496        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27497     if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27498       Shuffle = X86ISD::SHUFP;
27499       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27500       return true;
27501     }
27502   }
27503
27504   // Attempt to combine to SHUFPS.
27505   if (AllowFloatDomain && EltSizeInBits == 32 &&
27506       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27507        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27508        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27509     SmallVector<int, 4> RepeatedMask;
27510     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27511       // Match each half of the repeated mask, to determine if its just
27512       // referencing one of the vectors, is zeroable or entirely undef.
27513       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27514         int M0 = RepeatedMask[Offset];
27515         int M1 = RepeatedMask[Offset + 1];
27516
27517         if (isUndefInRange(RepeatedMask, Offset, 2)) {
27518           return DAG.getUNDEF(MaskVT);
27519         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27520           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27521           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27522           return getZeroVector(MaskVT, Subtarget, DAG, DL);
27523         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27524           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27525           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27526           return V1;
27527         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27528           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27529           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27530           return V2;
27531         }
27532
27533         return SDValue();
27534       };
27535
27536       int ShufMask[4] = {-1, -1, -1, -1};
27537       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27538       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27539
27540       if (Lo && Hi) {
27541         V1 = Lo;
27542         V2 = Hi;
27543         Shuffle = X86ISD::SHUFP;
27544         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27545         PermuteImm = getV4X86ShuffleImm(ShufMask);
27546         return true;
27547       }
27548     }
27549   }
27550
27551   return false;
27552 }
27553
27554 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27555 /// possible.
27556 ///
27557 /// This is the leaf of the recursive combine below. When we have found some
27558 /// chain of single-use x86 shuffle instructions and accumulated the combined
27559 /// shuffle mask represented by them, this will try to pattern match that mask
27560 /// into either a single instruction if there is a special purpose instruction
27561 /// for this operation, or into a PSHUFB instruction which is a fully general
27562 /// instruction but should only be used to replace chains over a certain depth.
      ///
      /// \param Inputs The one or two values entering the chain (asserted below).
      ///        Bitcasts on them are peeked through.
      /// \param Root The node that is replaced through DCI.CombineTo on success.
      /// \param BaseMask The accumulated shuffle mask; must be non-empty. Entries
      ///        may be SM_SentinelUndef or SM_SentinelZero.
      /// \param Depth Length of the chain. At depth 1 a match that would merely
      ///        recreate Root's own opcode is rejected, and below depth 2 only the
      ///        fixed-pattern matchers (everything before the "Depth < 2" check)
      ///        are tried.
      /// \param HasVariableMask When true, the variable-mask forms near the end of
      ///        this function are allowed without requiring Depth >= 3.
      /// \returns true if Root was replaced via DCI.CombineTo, false if nothing
      ///          was changed.
27563 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27564                                    ArrayRef<int> BaseMask, int Depth,
27565                                    bool HasVariableMask, SelectionDAG &DAG,
27566                                    TargetLowering::DAGCombinerInfo &DCI,
27567                                    const X86Subtarget &Subtarget) {
27568   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27569   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27570          "Unexpected number of shuffle inputs!");
27571
27572   // Find the inputs that enter the chain. Note that multiple uses are OK
27573   // here, we're not going to remove the operands we find.
27574   bool UnaryShuffle = (Inputs.size() == 1);
27575   SDValue V1 = peekThroughBitcasts(Inputs[0]);
27576   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27577                              : peekThroughBitcasts(Inputs[1]));
27578
27579   MVT VT1 = V1.getSimpleValueType();
27580   MVT VT2 = V2.getSimpleValueType();
27581   MVT RootVT = Root.getSimpleValueType();
27582   assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27583          VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27584          "Vector size mismatch");
27585
27586   SDLoc DL(Root);
27587   SDValue Res;
27588
        // A one-element mask can only be the identity (asserted): forward the
        // first input, bitcast to the root type.
27589   unsigned NumBaseMaskElts = BaseMask.size();
27590   if (NumBaseMaskElts == 1) {
27591     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27592     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27593                   /*AddTo*/ true);
27594     return true;
27595   }
27596
27597   unsigned RootSizeInBits = RootVT.getSizeInBits();
27598   unsigned NumRootElts = RootVT.getVectorNumElements();
27599   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
        // Use the floating-point domain when either input is a floating-point
        // vector, or when the root is 256 bits wide and AVX2 is unavailable.
27600   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27601                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27602
27603   // Don't combine if we are an AVX512/EVEX target and the mask element size
27604   // is different from the root element size - this would prevent writemasks
27605   // from being reused.
27606   // TODO - this currently prevents all lane shuffles from occurring.
27607   // TODO - check for writemasks usage instead of always preventing combining.
27608   // TODO - attempt to narrow Mask back to writemask size.
27609   bool IsEVEXShuffle =
27610       RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27611   if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27612     return false;
27613
27614   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27615
27616   // Handle 128-bit lane shuffles of 256-bit vectors.
27617   // TODO - this should support binary shuffles.
27618   if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27619       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27620     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27621       return false; // Nothing to do!
27622     MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
          // Immediate as built here: bits [3:0] come from BaseMask[0] and bits
          // [7:4] from BaseMask[1]. A non-negative entry contributes its low
          // bit; a negative (undef/zero sentinel) entry sets bit 3 of its
          // nibble instead.
27623     unsigned PermMask = 0;
27624     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27625     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27626
27627     Res = DAG.getBitcast(ShuffleVT, V1);
27628     DCI.AddToWorklist(Res.getNode());
27629     Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27630                       DAG.getUNDEF(ShuffleVT),
27631                       DAG.getConstant(PermMask, DL, MVT::i8));
27632     DCI.AddToWorklist(Res.getNode());
27633     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27634                   /*AddTo*/ true);
27635     return true;
27636   }
27637
27638   // For masks that have been widened to 128-bit elements or more,
27639   // narrow back down to 64-bit elements.
27640   SmallVector<int, 64> Mask;
27641   if (BaseMaskEltSizeInBits > 64) {
27642     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27643     int MaskScale = BaseMaskEltSizeInBits / 64;
27644     scaleShuffleMask(MaskScale, BaseMask, Mask);
27645   } else {
27646     Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27647   }
27648
27649   unsigned NumMaskElts = Mask.size();
27650   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27651
27652   // Determine the effective mask value type.
        // Mask elements narrower than 32 bits always use an integer type.
27653   FloatDomain &= (32 <= MaskEltSizeInBits);
27654   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27655                            : MVT::getIntegerVT(MaskEltSizeInBits);
27656   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27657
27658   // Only allow legal mask types.
27659   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27660     return false;
27661
27662   // Attempt to match the mask against known shuffle patterns.
27663   MVT ShuffleSrcVT, ShuffleVT;
27664   unsigned Shuffle, PermuteImm;
27665
27666   // Which shuffle domains are permitted?
27667   // Permit domain crossing at higher combine depths.
27668   bool AllowFloatDomain = FloatDomain || (Depth > 3);
27669   bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
27670                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
27671
27672   // Determine zeroable mask elements.
27673   APInt Zeroable(NumMaskElts, 0);
27674   for (unsigned i = 0; i != NumMaskElts; ++i)
27675     if (isUndefOrZero(Mask[i]))
27676       Zeroable.setBit(i);
27677
27678   if (UnaryShuffle) {
27679     // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27680     // directly if we don't shuffle the lower element and we shuffle the upper
27681     // (zero) elements within themselves.
27682     if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27683         (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
            // Scale is the number of mask elements covering one element of V1.
27684       unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27685       ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27686       if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27687           isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27688         DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27689                       /*AddTo*/ true);
27690         return true;
27691       }
27692     }
27693
          // Single-operand shuffle without an immediate.
27694     if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27695                                 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27696                                 ShuffleVT)) {
27697       if (Depth == 1 && Root.getOpcode() == Shuffle)
27698         return false; // Nothing to do!
27699       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27700         return false; // AVX512 Writemask clash.
27701       Res = DAG.getBitcast(ShuffleSrcVT, V1);
27702       DCI.AddToWorklist(Res.getNode());
27703       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27704       DCI.AddToWorklist(Res.getNode());
27705       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27706                     /*AddTo*/ true);
27707       return true;
27708     }
27709
          // Single-operand shuffle controlled by an 8-bit immediate.
27710     if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27711                                        AllowIntDomain, Subtarget, Shuffle,
27712                                        ShuffleVT, PermuteImm)) {
27713       if (Depth == 1 && Root.getOpcode() == Shuffle)
27714         return false; // Nothing to do!
27715       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27716         return false; // AVX512 Writemask clash.
27717       Res = DAG.getBitcast(ShuffleVT, V1);
27718       DCI.AddToWorklist(Res.getNode());
27719       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27720                         DAG.getConstant(PermuteImm, DL, MVT::i8));
27721       DCI.AddToWorklist(Res.getNode());
27722       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27723                     /*AddTo*/ true);
27724       return true;
27725     }
27726   }
27727
        // Two-operand shuffle without an immediate. This is tried for unary
        // shuffles as well (UnaryShuffle is passed through to the matcher).
27728   if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27729                                V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27730                                UnaryShuffle)) {
27731     if (Depth == 1 && Root.getOpcode() == Shuffle)
27732       return false; // Nothing to do!
27733     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27734       return false; // AVX512 Writemask clash.
27735     V1 = DAG.getBitcast(ShuffleVT, V1);
27736     DCI.AddToWorklist(V1.getNode());
27737     V2 = DAG.getBitcast(ShuffleVT, V2);
27738     DCI.AddToWorklist(V2.getNode());
27739     Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27740     DCI.AddToWorklist(Res.getNode());
27741     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27742                   /*AddTo*/ true);
27743     return true;
27744   }
27745
        // Two-operand shuffle controlled by an 8-bit immediate.
27746   if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27747                                       AllowIntDomain, V1, V2, DL, DAG,
27748                                       Subtarget, Shuffle, ShuffleVT,
27749                                       PermuteImm)) {
27750     if (Depth == 1 && Root.getOpcode() == Shuffle)
27751       return false; // Nothing to do!
27752     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27753       return false; // AVX512 Writemask clash.
27754     V1 = DAG.getBitcast(ShuffleVT, V1);
27755     DCI.AddToWorklist(V1.getNode());
27756     V2 = DAG.getBitcast(ShuffleVT, V2);
27757     DCI.AddToWorklist(V2.getNode());
27758     Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27759                       DAG.getConstant(PermuteImm, DL, MVT::i8));
27760     DCI.AddToWorklist(Res.getNode());
27761     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27762                   /*AddTo*/ true);
27763     return true;
27764   }
27765
27766   // Typically from here on, we need an integer version of MaskVT.
27767   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
27768   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
27769
27770   // Annoyingly, SSE4A instructions don't map into the above match helpers.
27771   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
27772     uint64_t BitLen, BitIdx;
27773     if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
27774                                   Zeroable)) {
27775       if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
27776         return false; // Nothing to do!
27777       V1 = DAG.getBitcast(IntMaskVT, V1);
27778       DCI.AddToWorklist(V1.getNode());
27779       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
27780                         DAG.getConstant(BitLen, DL, MVT::i8),
27781                         DAG.getConstant(BitIdx, DL, MVT::i8));
27782       DCI.AddToWorklist(Res.getNode());
27783       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27784                     /*AddTo*/ true);
27785       return true;
27786     }
27787
27788     if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
27789       if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
27790         return false; // Nothing to do!
27791       V1 = DAG.getBitcast(IntMaskVT, V1);
27792       DCI.AddToWorklist(V1.getNode());
27793       V2 = DAG.getBitcast(IntMaskVT, V2);
27794       DCI.AddToWorklist(V2.getNode());
27795       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
27796                         DAG.getConstant(BitLen, DL, MVT::i8),
27797                         DAG.getConstant(BitIdx, DL, MVT::i8));
27798       DCI.AddToWorklist(Res.getNode());
27799       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27800                     /*AddTo*/ true);
27801       return true;
27802     }
27803   }
27804
27805   // Don't try to re-form single instruction chains under any circumstances now
27806   // that we've done encoding canonicalization for them.
27807   if (Depth < 2)
27808     return false;
27809
27810   bool MaskContainsZeros =
27811       any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27812
27813   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27814     // If we have a single input lane-crossing shuffle then lower to VPERMV.
27815     if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27816         ((Subtarget.hasAVX2() &&
27817           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27818          (Subtarget.hasAVX512() &&
27819           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27820            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27821          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27822          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27823          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27824          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27825       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27826       DCI.AddToWorklist(VPermMask.getNode());
27827       Res = DAG.getBitcast(MaskVT, V1);
27828       DCI.AddToWorklist(Res.getNode());
            // Note the operand order: the index vector comes first for VPERMV.
27829       Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27830       DCI.AddToWorklist(Res.getNode());
27831       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27832                     /*AddTo*/ true);
27833       return true;
27834     }
27835
27836     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27837     // vector as the second source.
27838     if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27839         ((Subtarget.hasAVX512() &&
27840           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27841            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27842          (Subtarget.hasVLX() &&
27843           (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27844            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27845          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27846          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27847          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27848          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27849       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27850       for (unsigned i = 0; i != NumMaskElts; ++i)
27851         if (Mask[i] == SM_SentinelZero)
27852           Mask[i] = NumMaskElts + i;
27853
27854       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27855       DCI.AddToWorklist(VPermMask.getNode());
27856       Res = DAG.getBitcast(MaskVT, V1);
27857       DCI.AddToWorklist(Res.getNode());
27858       SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27859       DCI.AddToWorklist(Zero.getNode());
27860       Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27861       DCI.AddToWorklist(Res.getNode());
27862       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27863                     /*AddTo*/ true);
27864       return true;
27865     }
27866
27867     // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27868     if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27869         ((Subtarget.hasAVX512() &&
27870           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27871            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27872          (Subtarget.hasVLX() &&
27873           (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27874            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27875          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27876          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27877          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27878          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27879       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27880       DCI.AddToWorklist(VPermMask.getNode());
27881       V1 = DAG.getBitcast(MaskVT, V1);
27882       DCI.AddToWorklist(V1.getNode());
27883       V2 = DAG.getBitcast(MaskVT, V2);
27884       DCI.AddToWorklist(V2.getNode());
27885       Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27886       DCI.AddToWorklist(Res.getNode());
27887       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27888                     /*AddTo*/ true);
27889       return true;
27890     }
          // Lane-crossing masks are not attempted by any of the combines below.
27891     return false;
27892   }
27893
27894   // See if we can combine a single input shuffle with zeros to a bit-mask,
27895   // which is much simpler than any shuffle.
27896   if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27897       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27898       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
          // Build the AND constant per element: all-ones where the element is
          // kept, zero where the mask zeroes it, undef where the mask is undef.
27899     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27900     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27901     APInt UndefElts(NumMaskElts, 0);
27902     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27903     for (unsigned i = 0; i != NumMaskElts; ++i) {
27904       int M = Mask[i];
27905       if (M == SM_SentinelUndef) {
27906         UndefElts.setBit(i);
27907         continue;
27908       }
27909       if (M == SM_SentinelZero)
27910         continue;
27911       EltBits[i] = AllOnes;
27912     }
27913     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27914     DCI.AddToWorklist(BitMask.getNode());
27915     Res = DAG.getBitcast(MaskVT, V1);
27916     DCI.AddToWorklist(Res.getNode());
27917     unsigned AndOpcode =
27918         FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27919     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27920     DCI.AddToWorklist(Res.getNode());
27921     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27922                   /*AddTo*/ true);
27923     return true;
27924   }
27925
27926   // If we have a single input shuffle with different shuffle patterns in the
27927   // 128-bit lanes, use the variable mask form of VPERMILPS (VPERMILPV).
27928   // TODO Combine other mask types at higher depths.
27929   if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27930       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27931        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
          // Each index is reduced modulo 4; undef mask entries become undef
          // index elements.
27932     SmallVector<SDValue, 16> VPermIdx;
27933     for (int M : Mask) {
27934       SDValue Idx =
27935           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27936       VPermIdx.push_back(Idx);
27937     }
27938     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
27939     DCI.AddToWorklist(VPermMask.getNode());
27940     Res = DAG.getBitcast(MaskVT, V1);
27941     DCI.AddToWorklist(Res.getNode());
27942     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27943     DCI.AddToWorklist(Res.getNode());
27944     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27945                   /*AddTo*/ true);
27946     return true;
27947   }
27948
27949   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27950   // to VPERMIL2PD/VPERMIL2PS.
27951   if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27952       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27953        MaskVT == MVT::v8f32)) {
27954     // VPERMIL2 Operation.
27955     // Bits[3] - Match Bit.
27956     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27957     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
27958     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27959     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27960     SmallVector<int, 8> VPerm2Idx;
27961     unsigned M2ZImm = 0;
27962     for (int M : Mask) {
27963       if (M == SM_SentinelUndef) {
27964         VPerm2Idx.push_back(-1);
27965         continue;
27966       }
            // A zeroed element gets selector 8 (the match bit above) and
            // switches the immediate operand to 2.
27967       if (M == SM_SentinelZero) {
27968         M2ZImm = 2;
27969         VPerm2Idx.push_back(8);
27970         continue;
27971       }
            // In-lane element index, offset by NumEltsPerLane when M refers to
            // the second input (M >= NumMaskElts).
27972       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
            // 64-bit elements keep their selector in Bits[2:1], hence the shift.
27973       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27974       VPerm2Idx.push_back(Index);
27975     }
27976     V1 = DAG.getBitcast(MaskVT, V1);
27977     DCI.AddToWorklist(V1.getNode());
27978     V2 = DAG.getBitcast(MaskVT, V2);
27979     DCI.AddToWorklist(V2.getNode());
27980     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
27981     DCI.AddToWorklist(VPerm2MaskOp.getNode());
27982     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27983                       DAG.getConstant(M2ZImm, DL, MVT::i8));
27984     DCI.AddToWorklist(Res.getNode());
27985     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27986                   /*AddTo*/ true);
27987     return true;
27988   }
27989
27990   // If we have 3 or more shuffle instructions or a chain involving a variable
27991   // mask, we can replace them with a single PSHUFB instruction profitably.
27992   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27993   // instructions, but in practice PSHUFB tends to be *very* fast so we're
27994   // more aggressive.
27995   if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27996       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27997        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27998        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
          // Expand the element mask to one control byte per byte of the root
          // vector; Ratio is the number of bytes per mask element.
27999     SmallVector<SDValue, 16> PSHUFBMask;
28000     int NumBytes = RootVT.getSizeInBits() / 8;
28001     int Ratio = NumBytes / NumMaskElts;
28002     for (int i = 0; i < NumBytes; ++i) {
28003       int M = Mask[i / Ratio];
28004       if (M == SM_SentinelUndef) {
28005         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
28006         continue;
28007       }
            // Zeroed elements use a control byte of 255.
28008       if (M == SM_SentinelZero) {
28009         PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
28010         continue;
28011       }
28012       M = Ratio * M + i % Ratio;
28013       assert ((M / 16) == (i / 16) && "Lane crossing detected");
28014       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28015     }
28016     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28017     Res = DAG.getBitcast(ByteVT, V1);
28018     DCI.AddToWorklist(Res.getNode());
28019     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28020     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28021     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28022     DCI.AddToWorklist(Res.getNode());
28023     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28024                   /*AddTo*/ true);
28025     return true;
28026   }
28027
28028   // With XOP, if we have a 128-bit binary input shuffle we can always combine
28029   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28030   // slower than PSHUFB on targets that support both.
28031   if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
28032       Subtarget.hasXOP()) {
28033     // VPPERM Mask Operation
28034     // Bits[4:0] - Byte Index (0 - 31)
28035     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
28036     SmallVector<SDValue, 16> VPPERMMask;
28037     int NumBytes = 16;
28038     int Ratio = NumBytes / NumMaskElts;
28039     for (int i = 0; i < NumBytes; ++i) {
28040       int M = Mask[i / Ratio];
28041       if (M == SM_SentinelUndef) {
28042         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28043         continue;
28044       }
            // 128 puts 4 (ZERO) into Bits[7:5] of the control byte.
28045       if (M == SM_SentinelZero) {
28046         VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28047         continue;
28048       }
28049       M = Ratio * M + i % Ratio;
28050       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28051     }
28052     MVT ByteVT = MVT::v16i8;
28053     V1 = DAG.getBitcast(ByteVT, V1);
28054     DCI.AddToWorklist(V1.getNode());
28055     V2 = DAG.getBitcast(ByteVT, V2);
28056     DCI.AddToWorklist(V2.getNode());
28057     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28058     DCI.AddToWorklist(VPPERMMaskOp.getNode());
28059     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28060     DCI.AddToWorklist(Res.getNode());
28061     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28062                   /*AddTo*/ true);
28063     return true;
28064   }
28065
28066   // Failed to find any combines.
28067   return false;
28068 }
28069
28070 // Attempt to constant fold all of the constant source ops.
28071 // Returns true if the entire shuffle is folded to a constant.
28072 // TODO: Extend this to merge multiple constant Ops and update the mask.
      //
      // Ops are the shuffle sources and Mask indexes into their concatenation
      // (element M refers to op M / Mask.size(), element M % Mask.size()), with
      // SM_SentinelUndef / SM_SentinelZero entries allowed. On success Root is
      // replaced through DCI.CombineTo with a constant vector bitcast to Root's
      // type. Returns false, leaving Root untouched, if
      // getTargetConstantBitsFromNode fails for any op, or if no op has a
      // single use and HasVariableMask is false.
28073 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28074                                         ArrayRef<int> Mask, SDValue Root,
28075                                         bool HasVariableMask, SelectionDAG &DAG,
28076                                         TargetLowering::DAGCombinerInfo &DCI,
28077                                         const X86Subtarget &Subtarget) {
28078   MVT VT = Root.getSimpleValueType();
28079
28080   unsigned SizeInBits = VT.getSizeInBits();
28081   unsigned NumMaskElts = Mask.size();
28082   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28083   unsigned NumOps = Ops.size();
28084
28085   // Extract constant bits from each source op.
        // Bits are extracted at the mask element width, so every op yields
        // per-element data that can be indexed directly by mask entries.
28086   bool OneUseConstantOp = false;
28087   SmallVector<APInt, 16> UndefEltsOps(NumOps);
28088   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28089   for (unsigned i = 0; i != NumOps; ++i) {
28090     SDValue SrcOp = Ops[i];
28091     OneUseConstantOp |= SrcOp.hasOneUse();
28092     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28093                                        RawBitsOps[i]))
28094       return false;
28095   }
28096
28097   // Only fold if at least one of the constants is only used once or
28098   // the combined shuffle has included a variable mask shuffle, this
28099   // is to avoid constant pool bloat.
28100   if (!OneUseConstantOp && !HasVariableMask)
28101     return false;
28102
28103   // Shuffle the constant bits according to the mask.
        // Every result element is classified as exactly one of undef, zero or
        // constant (checked by the assert after the loop).
28104   APInt UndefElts(NumMaskElts, 0);
28105   APInt ZeroElts(NumMaskElts, 0);
28106   APInt ConstantElts(NumMaskElts, 0);
28107   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28108                                         APInt::getNullValue(MaskSizeInBits));
28109   for (unsigned i = 0; i != NumMaskElts; ++i) {
28110     int M = Mask[i];
28111     if (M == SM_SentinelUndef) {
28112       UndefElts.setBit(i);
28113       continue;
28114     } else if (M == SM_SentinelZero) {
28115       ZeroElts.setBit(i);
28116       continue;
28117     }
28118     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28119
          // Split the mask index into (source op, element within that op).
28120     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28121     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28122
          // An undef source element makes the result element undef.
28123     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28124     if (SrcUndefElts[SrcMaskIdx]) {
28125       UndefElts.setBit(i);
28126       continue;
28127     }
28128
          // A source element whose bits are all zero is recorded as zero;
          // ConstantBitData[i] keeps its null initial value in that case.
28129     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28130     APInt &Bits = SrcEltBits[SrcMaskIdx];
28131     if (!Bits) {
28132       ZeroElts.setBit(i);
28133       continue;
28134     }
28135
28136     ConstantElts.setBit(i);
28137     ConstantBitData[i] = Bits;
28138   }
28139   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28140
28141   // Create the constant data.
        // Use a floating-point element type only when Root is floating point
        // and the mask element is 32 or 64 bits wide; otherwise use integers.
28142   MVT MaskSVT;
28143   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28144     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28145   else
28146     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28147
28148   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28149
28150   SDLoc DL(Root);
28151   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28152   DCI.AddToWorklist(CstOp.getNode());
28153   DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
28154   return true;
28155 }
28156
28157 /// \brief Fully generic combining of x86 shuffle instructions.
28158 ///
28159 /// This should be the last combine run over the x86 shuffle instructions. Once
28160 /// they have been fully optimized, this will recursively consider all chains
28161 /// of single-use shuffle instructions, build a generic model of the cumulative
28162 /// shuffle operation, and check for simpler instructions which implement this
28163 /// operation. We use this primarily for two purposes:
28164 ///
28165 /// 1) Collapse generic shuffles to specialized single instructions when
28166 ///    equivalent. In most cases, this is just an encoding size win, but
28167 ///    sometimes we will collapse multiple generic shuffles into a single
28168 ///    special-purpose shuffle.
28169 /// 2) Look for sequences of shuffle instructions with 3 or more total
28170 ///    instructions, and replace them with the slightly more expensive SSSE3
28171 ///    PSHUFB instruction if available. We do this as the last combining step
28172 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
28173 ///    a suitable short sequence of other instructions. The PSHUFB will either
28174 ///    use a register or have to read from memory and so is slightly (but only
28175 ///    slightly) more expensive than the other shuffle instructions.
28176 ///
28177 /// Because this is inherently a quadratic operation (for each shuffle in
28178 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28179 /// This should never be an issue in practice as the shuffle lowering doesn't
28180 /// produce sequences of more than 8 instructions.
28181 ///
28182 /// FIXME: We will currently miss some cases where the redundant shuffling
28183 /// would simplify under the threshold for PSHUFB formation because of
28184 /// combine-ordering. To fix this, we should do the redundant instruction
28185 /// combining in this recursive walk.
28186 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
28187                                           int SrcOpIndex, SDValue Root,
28188                                           ArrayRef<int> RootMask,
28189                                           ArrayRef<const SDNode*> SrcNodes,
28190                                           int Depth, bool HasVariableMask,
28191                                           SelectionDAG &DAG,
28192                                           TargetLowering::DAGCombinerInfo &DCI,
28193                                           const X86Subtarget &Subtarget) {
28194   // Bound the depth of our recursive combine because this is ultimately
28195   // quadratic in nature.
28196   if (Depth > 8)
28197     return false;
28198
28199   // Directly rip through bitcasts to find the underlying operand.
28200   SDValue Op = SrcOps[SrcOpIndex];
28201   Op = peekThroughOneUseBitcasts(Op);
28202
28203   MVT VT = Op.getSimpleValueType();
28204   if (!VT.isVector())
28205     return false; // Bail if we hit a non-vector.
28206
28207   assert(Root.getSimpleValueType().isVector() &&
28208          "Shuffles operate on vector types!");
28209   assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28210          "Can only combine shuffles of the same vector register size.");
28211
28212   // Extract target shuffle mask and resolve sentinels and inputs.
28213   SmallVector<int, 64> OpMask;
28214   SmallVector<SDValue, 2> OpInputs;
28215   if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
28216     return false;
28217
28218   assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28219   SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
28220   SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
28221
28222   // Add the inputs to the Ops list, avoiding duplicates.
28223   SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
28224
28225   int InputIdx0 = -1, InputIdx1 = -1;
28226   for (int i = 0, e = Ops.size(); i < e; ++i) {
28227     SDValue BC = peekThroughBitcasts(Ops[i]);
28228     if (Input0 && BC == peekThroughBitcasts(Input0))
28229       InputIdx0 = i;
28230     if (Input1 && BC == peekThroughBitcasts(Input1))
28231       InputIdx1 = i;
28232   }
28233
28234   if (Input0 && InputIdx0 < 0) {
28235     InputIdx0 = SrcOpIndex;
28236     Ops[SrcOpIndex] = Input0;
28237   }
28238   if (Input1 && InputIdx1 < 0) {
28239     InputIdx1 = Ops.size();
28240     Ops.push_back(Input1);
28241   }
28242
28243   assert(((RootMask.size() > OpMask.size() &&
28244            RootMask.size() % OpMask.size() == 0) ||
28245           (OpMask.size() > RootMask.size() &&
28246            OpMask.size() % RootMask.size() == 0) ||
28247           OpMask.size() == RootMask.size()) &&
28248          "The smaller number of elements must divide the larger.");
28249
28250   // This function can be performance-critical, so we rely on the power-of-2
28251   // knowledge that we have about the mask sizes to replace div/rem ops with
28252   // bit-masks and shifts.
28253   assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28254   assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28255   unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
28256   unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
28257
28258   unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
28259   unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
28260   unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
28261   assert((RootRatio == 1 || OpRatio == 1) &&
28262          "Must not have a ratio for both incoming and op masks!");
28263
28264   assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28265   assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28266   assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28267   unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
28268   unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
28269
28270   SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
28271
28272   // Merge this shuffle operation's mask into our accumulated mask. Note that
28273   // this shuffle's mask will be the first applied to the input, followed by the
28274   // root mask to get us all the way to the root value arrangement. The reason
28275   // for this order is that we are recursing up the operation chain.
28276   for (unsigned i = 0; i < MaskWidth; ++i) {
28277     unsigned RootIdx = i >> RootRatioLog2;
28278     if (RootMask[RootIdx] < 0) {
28279       // This is a zero or undef lane, we're done.
28280       Mask[i] = RootMask[RootIdx];
28281       continue;
28282     }
28283
28284     unsigned RootMaskedIdx =
28285         RootRatio == 1
28286             ? RootMask[RootIdx]
28287             : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
28288
28289     // Just insert the scaled root mask value if it references an input other
28290     // than the SrcOp we're currently inserting.
28291     if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
28292         (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
28293       Mask[i] = RootMaskedIdx;
28294       continue;
28295     }
28296
28297     RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
28298     unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
28299     if (OpMask[OpIdx] < 0) {
28300       // The incoming lanes are zero or undef, it doesn't matter which ones we
28301       // are using.
28302       Mask[i] = OpMask[OpIdx];
28303       continue;
28304     }
28305
28306     // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28307     unsigned OpMaskedIdx =
28308         OpRatio == 1
28309             ? OpMask[OpIdx]
28310             : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
28311
28312     OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
28313     if (OpMask[OpIdx] < (int)OpMask.size()) {
28314       assert(0 <= InputIdx0 && "Unknown target shuffle input");
28315       OpMaskedIdx += InputIdx0 * MaskWidth;
28316     } else {
28317       assert(0 <= InputIdx1 && "Unknown target shuffle input");
28318       OpMaskedIdx += InputIdx1 * MaskWidth;
28319     }
28320
28321     Mask[i] = OpMaskedIdx;
28322   }
28323
28324   // Handle the all undef/zero cases early.
28325   if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28326     DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28327     return true;
28328   }
28329   if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28330     // TODO - should we handle the mixed zero/undef case as well? Just returning
28331     // a zero mask will lose information on undef elements possibly reducing
28332     // future combine possibilities.
28333     DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28334                                                 Subtarget, DAG, SDLoc(Root)));
28335     return true;
28336   }
28337
28338   // Remove unused shuffle source ops.
28339   resolveTargetShuffleInputsAndMask(Ops, Mask);
28340   assert(!Ops.empty() && "Shuffle with no inputs detected");
28341
28342   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28343
28344   // Update the list of shuffle nodes that have been combined so far.
28345   SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28346                                                 SrcNodes.end());
28347   CombinedNodes.push_back(Op.getNode());
28348
28349   // See if we can recurse into each shuffle source op (if it's a target
28350   // shuffle). The source op should only be combined if it either has a
28351   // single use (i.e. current Op) or all its users have already been combined.
28352   for (int i = 0, e = Ops.size(); i < e; ++i)
28353     if (Ops[i].getNode()->hasOneUse() ||
28354         SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28355       if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28356                                         Depth + 1, HasVariableMask, DAG, DCI,
28357                                         Subtarget))
28358         return true;
28359
28360   // Attempt to constant fold all of the constant source ops.
28361   if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28362                                   Subtarget))
28363     return true;
28364
28365   // We can only combine unary and binary shuffle mask cases.
28366   if (Ops.size() > 2)
28367     return false;
28368
28369   // Minor canonicalization of the accumulated shuffle mask to make it easier
28370   // to match below. All this does is detect masks with sequential pairs of
28371   // elements, and shrink them to the half-width mask. It does this in a loop
28372   // so it will reduce the size of the mask to the minimal width mask which
28373   // performs an equivalent shuffle.
28374   SmallVector<int, 64> WidenedMask;
28375   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28376     Mask = std::move(WidenedMask);
28377   }
28378
28379   // Canonicalization of binary shuffle masks to improve pattern matching by
28380   // commuting the inputs.
28381   if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28382     ShuffleVectorSDNode::commuteMask(Mask);
28383     std::swap(Ops[0], Ops[1]);
28384   }
28385
28386   return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28387                                 DCI, Subtarget);
28388 }
28389
28390 /// \brief Get the PSHUF-style mask from PSHUF node.
28391 ///
28392 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
28393 /// PSHUF-style masks that can be reused with such instructions.
28394 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28395   MVT VT = N.getSimpleValueType();
28396   SmallVector<int, 4> Mask;
28397   SmallVector<SDValue, 2> Ops;
28398   bool IsUnary;
28399   bool HaveMask =
28400       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28401   (void)HaveMask;
28402   assert(HaveMask);
28403
28404   // If we have more than 128-bits, only the low 128-bits of shuffle mask
28405   // matter. Check that the upper masks are repeats and remove them.
28406   if (VT.getSizeInBits() > 128) {
28407     int LaneElts = 128 / VT.getScalarSizeInBits();
28408 #ifndef NDEBUG
28409     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28410       for (int j = 0; j < LaneElts; ++j)
28411         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28412                "Mask doesn't repeat in high 128-bit lanes!");
28413 #endif
28414     Mask.resize(LaneElts);
28415   }
28416
28417   switch (N.getOpcode()) {
28418   case X86ISD::PSHUFD:
28419     return Mask;
28420   case X86ISD::PSHUFLW:
28421     Mask.resize(4);
28422     return Mask;
28423   case X86ISD::PSHUFHW:
28424     Mask.erase(Mask.begin(), Mask.begin() + 4);
28425     for (int &M : Mask)
28426       M -= 4;
28427     return Mask;
28428   default:
28429     llvm_unreachable("No valid shuffle instruction found!");
28430   }
28431 }
28432
28433 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28434 ///
28435 /// We walk up the chain and look for a combinable shuffle, skipping over
28436 /// shuffles that we could hoist this shuffle's transformation past without
28437 /// altering anything.
28438 static SDValue
28439 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28440                              SelectionDAG &DAG) {
28441   assert(N.getOpcode() == X86ISD::PSHUFD &&
28442          "Called with something other than an x86 128-bit half shuffle!");
28443   SDLoc DL(N);
28444
28445   // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28446   // of the shuffles in the chain so that we can form a fresh chain to replace
28447   // this one.
28448   SmallVector<SDValue, 8> Chain;
28449   SDValue V = N.getOperand(0);
28450   for (; V.hasOneUse(); V = V.getOperand(0)) {
28451     switch (V.getOpcode()) {
28452     default:
28453       return SDValue(); // Nothing combined!
28454
28455     case ISD::BITCAST:
28456       // Skip bitcasts as we always know the type for the target specific
28457       // instructions.
28458       continue;
28459
28460     case X86ISD::PSHUFD:
28461       // Found another dword shuffle.
28462       break;
28463
28464     case X86ISD::PSHUFLW:
28465       // Check that the low words (being shuffled) are the identity in the
28466       // dword shuffle, and the high words are self-contained.
28467       if (Mask[0] != 0 || Mask[1] != 1 ||
28468           !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28469         return SDValue();
28470
28471       Chain.push_back(V);
28472       continue;
28473
28474     case X86ISD::PSHUFHW:
28475       // Check that the high words (being shuffled) are the identity in the
28476       // dword shuffle, and the low words are self-contained.
28477       if (Mask[2] != 2 || Mask[3] != 3 ||
28478           !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28479         return SDValue();
28480
28481       Chain.push_back(V);
28482       continue;
28483
28484     case X86ISD::UNPCKL:
28485     case X86ISD::UNPCKH:
28486       // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28487       // shuffle into a preceding word shuffle.
28488       if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28489           V.getSimpleValueType().getVectorElementType() != MVT::i16)
28490         return SDValue();
28491
28492       // Search for a half-shuffle which we can combine with.
28493       unsigned CombineOp =
28494           V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28495       if (V.getOperand(0) != V.getOperand(1) ||
28496           !V->isOnlyUserOf(V.getOperand(0).getNode()))
28497         return SDValue();
28498       Chain.push_back(V);
28499       V = V.getOperand(0);
28500       do {
28501         switch (V.getOpcode()) {
28502         default:
28503           return SDValue(); // Nothing to combine.
28504
28505         case X86ISD::PSHUFLW:
28506         case X86ISD::PSHUFHW:
28507           if (V.getOpcode() == CombineOp)
28508             break;
28509
28510           Chain.push_back(V);
28511
28512           LLVM_FALLTHROUGH;
28513         case ISD::BITCAST:
28514           V = V.getOperand(0);
28515           continue;
28516         }
28517         break;
28518       } while (V.hasOneUse());
28519       break;
28520     }
28521     // Break out of the loop if we break out of the switch.
28522     break;
28523   }
28524
28525   if (!V.hasOneUse())
28526     // We fell out of the loop without finding a viable combining instruction.
28527     return SDValue();
28528
28529   // Merge this node's mask and our incoming mask.
28530   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28531   for (int &M : Mask)
28532     M = VMask[M];
28533   V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28534                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28535
28536   // Rebuild the chain around this new shuffle.
28537   while (!Chain.empty()) {
28538     SDValue W = Chain.pop_back_val();
28539
28540     if (V.getValueType() != W.getOperand(0).getValueType())
28541       V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28542
28543     switch (W.getOpcode()) {
28544     default:
28545       llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28546
28547     case X86ISD::UNPCKL:
28548     case X86ISD::UNPCKH:
28549       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28550       break;
28551
28552     case X86ISD::PSHUFD:
28553     case X86ISD::PSHUFLW:
28554     case X86ISD::PSHUFHW:
28555       V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28556       break;
28557     }
28558   }
28559   if (V.getValueType() != N.getValueType())
28560     V = DAG.getBitcast(N.getValueType(), V);
28561
28562   // Return the new chain to replace N.
28563   return V;
28564 }
28565
28566 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28567 /// pshufhw.
28568 ///
28569 /// We walk up the chain, skipping shuffles of the other half and looking
28570 /// through shuffles which switch halves trying to find a shuffle of the same
28571 /// pair of dwords.
28572 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28573                                         SelectionDAG &DAG,
28574                                         TargetLowering::DAGCombinerInfo &DCI) {
28575   assert(
28576       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28577       "Called with something other than an x86 128-bit half shuffle!");
28578   SDLoc DL(N);
28579   unsigned CombineOpcode = N.getOpcode();
28580
28581   // Walk up a single-use chain looking for a combinable shuffle.
28582   SDValue V = N.getOperand(0);
28583   for (; V.hasOneUse(); V = V.getOperand(0)) {
28584     switch (V.getOpcode()) {
28585     default:
28586       return false; // Nothing combined!
28587
28588     case ISD::BITCAST:
28589       // Skip bitcasts as we always know the type for the target specific
28590       // instructions.
28591       continue;
28592
28593     case X86ISD::PSHUFLW:
28594     case X86ISD::PSHUFHW:
28595       if (V.getOpcode() == CombineOpcode)
28596         break;
28597
28598       // Other-half shuffles are no-ops.
28599       continue;
28600     }
28601     // Break out of the loop if we break out of the switch.
28602     break;
28603   }
28604
28605   if (!V.hasOneUse())
28606     // We fell out of the loop without finding a viable combining instruction.
28607     return false;
28608
28609   // Combine away the bottom node as its shuffle will be accumulated into
28610   // a preceding shuffle.
28611   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28612
28613   // Record the old value.
28614   SDValue Old = V;
28615
28616   // Merge this node's mask and our incoming mask (adjusted to account for all
28617   // the pshufd instructions encountered).
28618   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28619   for (int &M : Mask)
28620     M = VMask[M];
28621   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28622                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28623
28624   // Check that the shuffles didn't cancel each other out. If not, we need to
28625   // combine to the new one.
28626   if (Old != V)
28627     // Replace the combinable shuffle with the combined one, updating all users
28628     // so that we re-evaluate the chain here.
28629     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28630
28631   return true;
28632 }
28633
28634 /// \brief Try to combine x86 target specific shuffles.
28635 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28636                                     TargetLowering::DAGCombinerInfo &DCI,
28637                                     const X86Subtarget &Subtarget) {
28638   SDLoc DL(N);
28639   MVT VT = N.getSimpleValueType();
28640   SmallVector<int, 4> Mask;
28641
28642   unsigned Opcode = N.getOpcode();
28643   switch (Opcode) {
28644   case X86ISD::PSHUFD:
28645   case X86ISD::PSHUFLW:
28646   case X86ISD::PSHUFHW:
28647     Mask = getPSHUFShuffleMask(N);
28648     assert(Mask.size() == 4);
28649     break;
28650   case X86ISD::UNPCKL: {
28651     auto Op0 = N.getOperand(0);
28652     auto Op1 = N.getOperand(1);
28653     unsigned Opcode0 = Op0.getOpcode();
28654     unsigned Opcode1 = Op1.getOpcode();
28655
28656     // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28657     // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28658     // TODO: Add other horizontal operations as required.
28659     if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28660       return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28661
28662     // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28663     // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28664     // moves upper half elements into the lower half part. For example:
28665     //
28666     // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28667     //     undef:v16i8
28668     // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28669     //
28670     // will be combined to:
28671     //
28672     // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28673
28674     // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28675     // happen due to advanced instructions.
28676     if (!VT.is128BitVector())
28677       return SDValue();
28678
28679     if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28680       ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28681
28682       unsigned NumElts = VT.getVectorNumElements();
28683       SmallVector<int, 8> ExpectedMask(NumElts, -1);
28684       std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28685                 NumElts / 2);
28686
28687       auto ShufOp = Op1.getOperand(0);
28688       if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28689         return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28690     }
28691     return SDValue();
28692   }
28693   case X86ISD::BLENDI: {
28694     SDValue V0 = N->getOperand(0);
28695     SDValue V1 = N->getOperand(1);
28696     assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28697            "Unexpected input vector types");
28698
28699     // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28700     // operands and changing the mask to 1. This saves us a bunch of
28701     // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28702     // x86InstrInfo knows how to commute this back after instruction selection
28703     // if it would help register allocation.
28704
28705     // TODO: If optimizing for size or a processor that doesn't suffer from
28706     // partial register update stalls, this should be transformed into a MOVSD
28707     // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
28708
28709     if (VT == MVT::v2f64)
28710       if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28711         if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28712           SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28713           return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28714         }
28715
28716     return SDValue();
28717   }
28718   case X86ISD::MOVSD:
28719   case X86ISD::MOVSS: {
28720     SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28721     SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28722     bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28723     bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28724     if (isZero0 && isZero1)
28725       return SDValue();
28726
28727     // We often lower to MOVSD/MOVSS from integer as well as native float
28728     // types; remove unnecessary domain-crossing bitcasts if we can to make it
28729     // easier to combine shuffles later on. We've already accounted for the
28730     // domain switching cost when we decided to lower with it.
28731     bool isFloat = VT.isFloatingPoint();
28732     bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28733     bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28734     if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28735       MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28736                           : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28737       V0 = DAG.getBitcast(NewVT, V0);
28738       V1 = DAG.getBitcast(NewVT, V1);
28739       return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28740     }
28741
28742     return SDValue();
28743   }
28744   case X86ISD::INSERTPS: {
28745     assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28746     SDValue Op0 = N.getOperand(0);
28747     SDValue Op1 = N.getOperand(1);
28748     SDValue Op2 = N.getOperand(2);
28749     unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28750     unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28751     unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28752     unsigned ZeroMask = InsertPSMask & 0xF;
28753
28754     // If we zero out all elements from Op0 then we don't need to reference it.
28755     if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28756       return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28757                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
28758
28759     // If we zero out the element from Op1 then we don't need to reference it.
28760     if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28761       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28762                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
28763
28764     // Attempt to merge insertps Op1 with an inner target shuffle node.
28765     SmallVector<int, 8> TargetMask1;
28766     SmallVector<SDValue, 2> Ops1;
28767     if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28768       int M = TargetMask1[SrcIdx];
28769       if (isUndefOrZero(M)) {
28770         // Zero/UNDEF insertion - zero out element and remove dependency.
28771         InsertPSMask |= (1u << DstIdx);
28772         return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28773                            DAG.getConstant(InsertPSMask, DL, MVT::i8));
28774       }
28775       // Update insertps mask srcidx and reference the source input directly.
28776       assert(0 <= M && M < 8 && "Shuffle index out of range");
28777       InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28778       Op1 = Ops1[M < 4 ? 0 : 1];
28779       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28780                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
28781     }
28782
28783     // Attempt to merge insertps Op0 with an inner target shuffle node.
28784     SmallVector<int, 8> TargetMask0;
28785     SmallVector<SDValue, 2> Ops0;
28786     if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28787       return SDValue();
28788
28789     bool Updated = false;
28790     bool UseInput00 = false;
28791     bool UseInput01 = false;
28792     for (int i = 0; i != 4; ++i) {
28793       int M = TargetMask0[i];
28794       if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28795         // No change if element is already zero or the inserted element.
28796         continue;
28797       } else if (isUndefOrZero(M)) {
28798         // If the target mask is undef/zero then we must zero the element.
28799         InsertPSMask |= (1u << i);
28800         Updated = true;
28801         continue;
28802       }
28803
28804       // The input vector element must be inline.
28805       if (M != i && M != (i + 4))
28806         return SDValue();
28807
28808       // Determine which inputs of the target shuffle we're using.
28809       UseInput00 |= (0 <= M && M < 4);
28810       UseInput01 |= (4 <= M);
28811     }
28812
28813     // If we're not using both inputs of the target shuffle then use the
28814     // referenced input directly.
28815     if (UseInput00 && !UseInput01) {
28816       Updated = true;
28817       Op0 = Ops0[0];
28818     } else if (!UseInput00 && UseInput01) {
28819       Updated = true;
28820       Op0 = Ops0[1];
28821     }
28822
28823     if (Updated)
28824       return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28825                          DAG.getConstant(InsertPSMask, DL, MVT::i8));
28826
28827     return SDValue();
28828   }
28829   default:
28830     return SDValue();
28831   }
28832
28833   // Nuke no-op shuffles that show up after combining.
28834   if (isNoopShuffleMask(Mask))
28835     return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28836
28837   // Look for simplifications involving one or two shuffle instructions.
28838   SDValue V = N.getOperand(0);
28839   switch (N.getOpcode()) {
28840   default:
28841     break;
28842   case X86ISD::PSHUFLW:
28843   case X86ISD::PSHUFHW:
28844     assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28845
28846     if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28847       return SDValue(); // We combined away this shuffle, so we're done.
28848
28849     // See if this reduces to a PSHUFD which is no more expensive and can
28850     // combine with more operations. Note that it has to at least flip the
28851     // dwords as otherwise it would have been removed as a no-op.
28852     if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28853       int DMask[] = {0, 1, 2, 3};
28854       int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28855       DMask[DOffset + 0] = DOffset + 1;
28856       DMask[DOffset + 1] = DOffset + 0;
28857       MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28858       V = DAG.getBitcast(DVT, V);
28859       DCI.AddToWorklist(V.getNode());
28860       V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28861                       getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28862       DCI.AddToWorklist(V.getNode());
28863       return DAG.getBitcast(VT, V);
28864     }
28865
28866     // Look for shuffle patterns which can be implemented as a single unpack.
28867     // FIXME: This doesn't handle the location of the PSHUFD generically, and
28868     // only works when we have a PSHUFD followed by two half-shuffles.
28869     if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28870         (V.getOpcode() == X86ISD::PSHUFLW ||
28871          V.getOpcode() == X86ISD::PSHUFHW) &&
28872         V.getOpcode() != N.getOpcode() &&
28873         V.hasOneUse()) {
28874       SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28875       if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28876         SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28877         SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28878         int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28879         int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28880         int WordMask[8];
28881         for (int i = 0; i < 4; ++i) {
28882           WordMask[i + NOffset] = Mask[i] + NOffset;
28883           WordMask[i + VOffset] = VMask[i] + VOffset;
28884         }
28885         // Map the word mask through the DWord mask.
28886         int MappedMask[8];
28887         for (int i = 0; i < 8; ++i)
28888           MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28889         if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28890             makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28891           // We can replace all three shuffles with an unpack.
28892           V = DAG.getBitcast(VT, D.getOperand(0));
28893           DCI.AddToWorklist(V.getNode());
28894           return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28895                                                 : X86ISD::UNPCKH,
28896                              DL, VT, V, V);
28897         }
28898       }
28899     }
28900
28901     break;
28902
28903   case X86ISD::PSHUFD:
28904     if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28905       return NewN;
28906
28907     break;
28908   }
28909
28910   return SDValue();
28911 }
28912
28913 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28914 /// operation. If true is returned then the operands of ADDSUB operation
28915 /// are written to the parameters \p Opnd0 and \p Opnd1.
28916 ///
28917 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28918 /// so it is easier to generically match. We also insert dummy vector shuffle
28919 /// nodes for the operands which explicitly discard the lanes which are unused
28920 /// by this operation to try to flow through the rest of the combiner
28921 /// the fact that they're unused.
28922 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28923                      SDValue &Opnd0, SDValue &Opnd1) {
28924
28925   EVT VT = N->getValueType(0);
28926   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28927       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28928       (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28929     return false;
28930
28931   // We only handle target-independent shuffles.
28932   // FIXME: It would be easy and harmless to use the target shuffle mask
28933   // extraction tool to support more.
28934   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28935     return false;
28936
28937   ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28938   SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28939
28940   SDValue V1 = N->getOperand(0);
28941   SDValue V2 = N->getOperand(1);
28942
28943   // We require the first shuffle operand to be the FSUB node, and the second to
28944   // be the FADD node.
28945   if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28946     ShuffleVectorSDNode::commuteMask(Mask);
28947     std::swap(V1, V2);
28948   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28949     return false;
28950
28951   // If there are other uses of these operations we can't fold them.
28952   if (!V1->hasOneUse() || !V2->hasOneUse())
28953     return false;
28954
28955   // Ensure that both operations have the same operands. Note that we can
28956   // commute the FADD operands.
28957   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28958   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28959       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28960     return false;
28961
28962   // We're looking for blends between FADD and FSUB nodes. We insist on these
28963   // nodes being lined up in a specific expected pattern.
28964   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28965         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28966         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28967         isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28968                                            8, 25, 10, 27, 12, 29, 14, 31})))
28969     return false;
28970
28971   Opnd0 = LHS;
28972   Opnd1 = RHS;
28973   return true;
28974 }
28975
28976 /// \brief Try to combine a shuffle into a target-specific add-sub or
28977 /// mul-add-sub node.
28978 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28979                                                 const X86Subtarget &Subtarget,
28980                                                 SelectionDAG &DAG) {
28981   SDValue Opnd0, Opnd1;
28982   if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28983     return SDValue();
28984
28985   EVT VT = N->getValueType(0);
28986   SDLoc DL(N);
28987
28988   // Try to generate X86ISD::FMADDSUB node here.
28989   SDValue Opnd2;
28990   if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28991     return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28992
28993   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28994   // the ADDSUB idiom has been successfully recognized. There are no known
28995   // X86 targets with 512-bit ADDSUB instructions!
28996   if (VT.is512BitVector())
28997     return SDValue();
28998
28999   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
29000 }
29001
29002 // We are looking for a shuffle where both sources are concatenated with undef
29003 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
29004 // if we can express this as a single-source shuffle, that's preferable.
29005 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
29006                                            const X86Subtarget &Subtarget) {
29007   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
29008     return SDValue();
29009
29010   EVT VT = N->getValueType(0);
29011
29012   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
29013   if (!VT.is128BitVector() && !VT.is256BitVector())
29014     return SDValue();
29015
29016   if (VT.getVectorElementType() != MVT::i32 &&
29017       VT.getVectorElementType() != MVT::i64 &&
29018       VT.getVectorElementType() != MVT::f32 &&
29019       VT.getVectorElementType() != MVT::f64)
29020     return SDValue();
29021
29022   SDValue N0 = N->getOperand(0);
29023   SDValue N1 = N->getOperand(1);
29024
29025   // Check that both sources are concats with undef.
29026   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29027       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29028       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29029       !N1.getOperand(1).isUndef())
29030     return SDValue();
29031
29032   // Construct the new shuffle mask. Elements from the first source retain their
29033   // index, but elements from the second source no longer need to skip an undef.
29034   SmallVector<int, 8> Mask;
29035   int NumElts = VT.getVectorNumElements();
29036
29037   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29038   for (int Elt : SVOp->getMask())
29039     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
29040
29041   SDLoc DL(N);
29042   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29043                                N1.getOperand(0));
29044   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29045 }
29046
29047 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29048                               TargetLowering::DAGCombinerInfo &DCI,
29049                               const X86Subtarget &Subtarget) {
29050   SDLoc dl(N);
29051   EVT VT = N->getValueType(0);
29052   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29053   // If we have legalized the vector types, look for blends of FADD and FSUB
29054   // nodes that we can fuse into an ADDSUB node.
29055   if (TLI.isTypeLegal(VT))
29056     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29057       return AddSub;
29058
29059   // During Type Legalization, when promoting illegal vector types,
29060   // the backend might introduce new shuffle dag nodes and bitcasts.
29061   //
29062   // This code performs the following transformation:
29063   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29064   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29065   //
29066   // We do this only if both the bitcast and the BINOP dag nodes have
29067   // one use. Also, perform this transformation only if the new binary
29068   // operation is legal. This is to avoid introducing dag nodes that
29069   // potentially need to be further expanded (or custom lowered) into a
29070   // less optimal sequence of dag nodes.
29071   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29072       N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29073       N->getOperand(0).getOpcode() == ISD::BITCAST &&
29074       N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29075     SDValue N0 = N->getOperand(0);
29076     SDValue N1 = N->getOperand(1);
29077
29078     SDValue BC0 = N0.getOperand(0);
29079     EVT SVT = BC0.getValueType();
29080     unsigned Opcode = BC0.getOpcode();
29081     unsigned NumElts = VT.getVectorNumElements();
29082
29083     if (BC0.hasOneUse() && SVT.isVector() &&
29084         SVT.getVectorNumElements() * 2 == NumElts &&
29085         TLI.isOperationLegal(Opcode, VT)) {
29086       bool CanFold = false;
29087       switch (Opcode) {
29088       default : break;
29089       case ISD::ADD:
29090       case ISD::SUB:
29091       case ISD::MUL:
29092         // isOperationLegal lies for integer ops on floating point types.
29093         CanFold = VT.isInteger();
29094         break;
29095       case ISD::FADD:
29096       case ISD::FSUB:
29097       case ISD::FMUL:
29098         // isOperationLegal lies for floating point ops on integer types.
29099         CanFold = VT.isFloatingPoint();
29100         break;
29101       }
29102
29103       unsigned SVTNumElts = SVT.getVectorNumElements();
29104       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29105       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
29106         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29107       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
29108         CanFold = SVOp->getMaskElt(i) < 0;
29109
29110       if (CanFold) {
29111         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29112         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29113         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29114         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29115       }
29116     }
29117   }
29118
29119   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29120   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29121   // consecutive, non-overlapping, and in the right order.
29122   SmallVector<SDValue, 16> Elts;
29123   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
29124     if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29125       Elts.push_back(Elt);
29126       continue;
29127     }
29128     Elts.clear();
29129     break;
29130   }
29131
29132   if (Elts.size() == VT.getVectorNumElements())
29133     if (SDValue LD =
29134             EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
29135       return LD;
29136
29137   // For AVX2, we sometimes want to combine
29138   // (vector_shuffle <mask> (concat_vectors t1, undef)
29139   //                        (concat_vectors t2, undef))
29140   // Into:
29141   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29142   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
29143   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29144     return ShufConcat;
29145
29146   if (isTargetShuffle(N->getOpcode())) {
29147     SDValue Op(N, 0);
29148     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29149       return Shuffle;
29150
29151     // Try recursively combining arbitrary sequences of x86 shuffle
29152     // instructions into higher-order shuffles. We do this after combining
29153     // specific PSHUF instruction sequences into their minimal form so that we
29154     // can evaluate how many specialized shuffle instructions are involved in
29155     // a particular chain.
29156     SmallVector<int, 1> NonceMask; // Just a placeholder.
29157     NonceMask.push_back(0);
29158     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
29159                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
29160                                       DCI, Subtarget))
29161       return SDValue(); // This routine will use CombineTo to replace N.
29162   }
29163
29164   return SDValue();
29165 }
29166
29167 /// Check if a vector extract from a target-specific shuffle of a load can be
29168 /// folded into a single element load.
29169 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29170 /// shuffles have been custom lowered so we need to handle those here.
29171 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29172                                          TargetLowering::DAGCombinerInfo &DCI) {
29173   if (DCI.isBeforeLegalizeOps())
29174     return SDValue();
29175
29176   SDValue InVec = N->getOperand(0);
29177   SDValue EltNo = N->getOperand(1);
29178   EVT EltVT = N->getValueType(0);
29179
29180   if (!isa<ConstantSDNode>(EltNo))
29181     return SDValue();
29182
29183   EVT OriginalVT = InVec.getValueType();
29184
29185   // Peek through bitcasts, don't duplicate a load with other uses.
29186   InVec = peekThroughOneUseBitcasts(InVec);
29187
29188   EVT CurrentVT = InVec.getValueType();
29189   if (!CurrentVT.isVector() ||
29190       CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29191     return SDValue();
29192
29193   if (!isTargetShuffle(InVec.getOpcode()))
29194     return SDValue();
29195
29196   // Don't duplicate a load with other uses.
29197   if (!InVec.hasOneUse())
29198     return SDValue();
29199
29200   SmallVector<int, 16> ShuffleMask;
29201   SmallVector<SDValue, 2> ShuffleOps;
29202   bool UnaryShuffle;
29203   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29204                             ShuffleOps, ShuffleMask, UnaryShuffle))
29205     return SDValue();
29206
29207   // Select the input vector, guarding against out of range extract vector.
29208   unsigned NumElems = CurrentVT.getVectorNumElements();
29209   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29210   int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29211
29212   if (Idx == SM_SentinelZero)
29213     return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29214                              : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29215   if (Idx == SM_SentinelUndef)
29216     return DAG.getUNDEF(EltVT);
29217
29218   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29219   SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29220                                          : ShuffleOps[1];
29221
29222   // If inputs to shuffle are the same for both ops, then allow 2 uses
29223   unsigned AllowedUses =
29224       (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29225
29226   if (LdNode.getOpcode() == ISD::BITCAST) {
29227     // Don't duplicate a load with other uses.
29228     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29229       return SDValue();
29230
29231     AllowedUses = 1; // only allow 1 load use if we have a bitcast
29232     LdNode = LdNode.getOperand(0);
29233   }
29234
29235   if (!ISD::isNormalLoad(LdNode.getNode()))
29236     return SDValue();
29237
29238   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29239
29240   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29241     return SDValue();
29242
29243   // If there's a bitcast before the shuffle, check if the load type and
29244   // alignment is valid.
29245   unsigned Align = LN0->getAlignment();
29246   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29247   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
29248       EltVT.getTypeForEVT(*DAG.getContext()));
29249
29250   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
29251     return SDValue();
29252
29253   // All checks match so transform back to vector_shuffle so that DAG combiner
29254   // can finish the job
29255   SDLoc dl(N);
29256
29257   // Create shuffle node taking into account the case that its a unary shuffle
29258   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
29259   Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
29260                                  ShuffleMask);
29261   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
29262   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
29263                      EltNo);
29264 }
29265
29266 // Try to match patterns such as
29267 // (i16 bitcast (v16i1 x))
29268 // ->
29269 // (i16 movmsk (16i8 sext (v16i1 x)))
29270 // before the illegal vector is scalarized on subtargets that don't have legal
29271 // vxi1 types.
29272 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
29273                                   const X86Subtarget &Subtarget) {
29274   EVT VT = BitCast.getValueType();
29275   SDValue N0 = BitCast.getOperand(0);
29276   EVT VecVT = N0->getValueType(0);
29277
29278   if (!VT.isScalarInteger() || !VecVT.isSimple())
29279     return SDValue();
29280
29281   // With AVX512 vxi1 types are legal and we prefer using k-regs.
29282   // MOVMSK is supported in SSE2 or later.
29283   if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
29284     return SDValue();
29285
29286   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
29287   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
29288   // v8i16 and v16i16.
29289   // For these two cases, we can shuffle the upper element bytes to a
29290   // consecutive sequence at the start of the vector and treat the results as
29291   // v16i8 or v32i8, and for v61i8 this is the preferable solution. However,
29292   // for v16i16 this is not the case, because the shuffle is expensive, so we
29293   // avoid sign-extending to this type entirely.
29294   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
29295   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
29296   MVT SExtVT;
29297   MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
29298   switch (VecVT.getSimpleVT().SimpleTy) {
29299   default:
29300     return SDValue();
29301   case MVT::v2i1:
29302     SExtVT = MVT::v2i64;
29303     FPCastVT = MVT::v2f64;
29304     break;
29305   case MVT::v4i1:
29306     SExtVT = MVT::v4i32;
29307     FPCastVT = MVT::v4f32;
29308     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
29309     // sign-extend to a 256-bit operation to avoid truncation.
29310     if (N0->getOpcode() == ISD::SETCC &&
29311         N0->getOperand(0)->getValueType(0).is256BitVector() &&
29312         Subtarget.hasInt256()) {
29313       SExtVT = MVT::v4i64;
29314       FPCastVT = MVT::v4f64;
29315     }
29316     break;
29317   case MVT::v8i1:
29318     SExtVT = MVT::v8i16;
29319     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
29320     // sign-extend to a 256-bit operation to match the compare.
29321     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
29322     // 256-bit because the shuffle is cheaper than sign extending the result of
29323     // the compare.
29324     if (N0->getOpcode() == ISD::SETCC &&
29325         N0->getOperand(0)->getValueType(0).is256BitVector() &&
29326         Subtarget.hasInt256()) {
29327       SExtVT = MVT::v8i32;
29328       FPCastVT = MVT::v8f32;
29329     }
29330     break;
29331   case MVT::v16i1:
29332     SExtVT = MVT::v16i8;
29333     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
29334     // it is not profitable to sign-extend to 256-bit because this will
29335     // require an extra cross-lane shuffle which is more expensive than
29336     // truncating the result of the compare to 128-bits.
29337     break;
29338   case MVT::v32i1:
29339     // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
29340     if (!Subtarget.hasInt256())
29341       return SDValue();
29342     SExtVT = MVT::v32i8;
29343     break;
29344   };
29345
29346   SDLoc DL(BitCast);
29347   SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
29348   if (SExtVT == MVT::v8i16) {
29349     V = DAG.getBitcast(MVT::v16i8, V);
29350     V = DAG.getVectorShuffle(
29351         MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
29352         {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
29353   } else
29354     assert(SExtVT.getScalarType() != MVT::i16 &&
29355            "Vectors of i16 must be shuffled");
29356   if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29357     V = DAG.getBitcast(FPCastVT, V);
29358   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29359   return DAG.getZExtOrTrunc(V, DL, VT);
29360 }
29361
29362 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29363                               TargetLowering::DAGCombinerInfo &DCI,
29364                               const X86Subtarget &Subtarget) {
29365   SDValue N0 = N->getOperand(0);
29366   EVT VT = N->getValueType(0);
29367   EVT SrcVT = N0.getValueType();
29368
29369   // Try to match patterns such as
29370   // (i16 bitcast (v16i1 x))
29371   // ->
29372   // (i16 movmsk (16i8 sext (v16i1 x)))
29373   // before the setcc result is scalarized on subtargets that don't have legal
29374   // vxi1 types.
29375   if (DCI.isBeforeLegalize())
29376     if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29377       return V;
29378   // Since MMX types are special and don't usually play with other vector types,
29379   // it's better to handle them early to be sure we emit efficient code by
29380   // avoiding store-load conversions.
29381
29382   // Detect bitcasts between i32 to x86mmx low word.
29383   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29384       SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29385     SDValue N00 = N0->getOperand(0);
29386     if (N00.getValueType() == MVT::i32)
29387       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29388   }
29389
29390   // Detect bitcasts between element or subvector extraction to x86mmx.
29391   if (VT == MVT::x86mmx &&
29392       (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29393        N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29394       isNullConstant(N0.getOperand(1))) {
29395     SDValue N00 = N0->getOperand(0);
29396     if (N00.getValueType().is128BitVector())
29397       return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29398                          DAG.getBitcast(MVT::v2i64, N00));
29399   }
29400
29401   // Detect bitcasts from FP_TO_SINT to x86mmx.
29402   if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29403       N0.getOpcode() == ISD::FP_TO_SINT) {
29404     SDLoc DL(N0);
29405     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29406                               DAG.getUNDEF(MVT::v2i32));
29407     return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29408                        DAG.getBitcast(MVT::v2i64, Res));
29409   }
29410
29411   // Convert a bitcasted integer logic operation that has one bitcasted
29412   // floating-point operand into a floating-point logic operation. This may
29413   // create a load of a constant, but that is cheaper than materializing the
29414   // constant in an integer register and transferring it to an SSE register or
29415   // transferring the SSE operand to integer register and back.
29416   unsigned FPOpcode;
29417   switch (N0.getOpcode()) {
29418     case ISD::AND: FPOpcode = X86ISD::FAND; break;
29419     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
29420     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29421     default: return SDValue();
29422   }
29423
29424   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29425         (Subtarget.hasSSE2() && VT == MVT::f64)))
29426     return SDValue();
29427
29428   SDValue LogicOp0 = N0.getOperand(0);
29429   SDValue LogicOp1 = N0.getOperand(1);
29430   SDLoc DL0(N0);
29431
29432   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29433   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29434       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29435       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29436     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29437     return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29438   }
29439   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29440   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29441       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29442       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29443     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29444     return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29445   }
29446
29447   return SDValue();
29448 }
29449
29450 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29451 // the elements of a vector.
29452 // Returns the vector that is being reduced on, or SDValue() if a reduction
29453 // was not matched.
29454 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29455   // The pattern must end in an extract from index 0.
29456   if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29457       !isNullConstant(Extract->getOperand(1)))
29458     return SDValue();
29459
29460   unsigned Stages =
29461       Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29462
29463   SDValue Op = Extract->getOperand(0);
29464   // At each stage, we're looking for something that looks like:
29465   // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29466   //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29467   //                               i32 undef, i32 undef, i32 undef, i32 undef>
29468   // %a = binop <8 x i32> %op, %s
29469   // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29470   // we expect something like:
29471   // <4,5,6,7,u,u,u,u>
29472   // <2,3,u,u,u,u,u,u>
29473   // <1,u,u,u,u,u,u,u>
29474   for (unsigned i = 0; i < Stages; ++i) {
29475     if (Op.getOpcode() != BinOp)
29476       return SDValue();
29477
29478     ShuffleVectorSDNode *Shuffle =
29479         dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29480     if (Shuffle) {
29481       Op = Op.getOperand(1);
29482     } else {
29483       Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29484       Op = Op.getOperand(0);
29485     }
29486
29487     // The first operand of the shuffle should be the same as the other operand
29488     // of the add.
29489     if (!Shuffle || (Shuffle->getOperand(0) != Op))
29490       return SDValue();
29491
29492     // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29493     for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29494       if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29495         return SDValue();
29496   }
29497
29498   return Op;
29499 }
29500
29501 // Given a select, detect the following pattern:
29502 // 1:    %2 = zext <N x i8> %0 to <N x i32>
29503 // 2:    %3 = zext <N x i8> %1 to <N x i32>
29504 // 3:    %4 = sub nsw <N x i32> %2, %3
29505 // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29506 // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
29507 // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29508 // This is useful as it is the input into a SAD pattern.
29509 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29510                               SDValue &Op1) {
29511   // Check the condition of the select instruction is greater-than.
29512   SDValue SetCC = Select->getOperand(0);
29513   if (SetCC.getOpcode() != ISD::SETCC)
29514     return false;
29515   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29516   if (CC != ISD::SETGT && CC != ISD::SETLT)
29517     return false;
29518
29519   SDValue SelectOp1 = Select->getOperand(1);
29520   SDValue SelectOp2 = Select->getOperand(2);
29521
29522   // The following instructions assume SelectOp1 is the subtraction operand
29523   // and SelectOp2 is the negation operand.
29524   // In the case of SETLT this is the other way around.
29525   if (CC == ISD::SETLT)
29526     std::swap(SelectOp1, SelectOp2);
29527
29528   // The second operand of the select should be the negation of the first
29529   // operand, which is implemented as 0 - SelectOp1.
29530   if (!(SelectOp2.getOpcode() == ISD::SUB &&
29531         ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29532         SelectOp2.getOperand(1) == SelectOp1))
29533     return false;
29534
29535   // The first operand of SetCC is the first operand of the select, which is the
29536   // difference between the two input vectors.
29537   if (SetCC.getOperand(0) != SelectOp1)
29538     return false;
29539
29540   // In SetLT case, The second operand of the comparison can be either 1 or 0.
29541   APInt SplatVal;
29542   if ((CC == ISD::SETLT) &&
29543       !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29544          SplatVal == 1) ||
29545         (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29546     return false;
29547
29548   // In SetGT case, The second operand of the comparison can be either -1 or 0.
29549   if ((CC == ISD::SETGT) &&
29550       !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29551         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29552     return false;
29553
29554   // The first operand of the select is the difference between the two input
29555   // vectors.
29556   if (SelectOp1.getOpcode() != ISD::SUB)
29557     return false;
29558
29559   Op0 = SelectOp1.getOperand(0);
29560   Op1 = SelectOp1.getOperand(1);
29561
29562   // Check if the operands of the sub are zero-extended from vectors of i8.
29563   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29564       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29565       Op1.getOpcode() != ISD::ZERO_EXTEND ||
29566       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29567     return false;
29568
29569   return true;
29570 }
29571
29572 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29573 // to these zexts.
29574 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29575                             const SDValue &Zext1, const SDLoc &DL) {
29576
29577   // Find the appropriate width for the PSADBW.
29578   EVT InVT = Zext0.getOperand(0).getValueType();
29579   unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29580
29581   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29582   // fill in the missing vector elements with 0.
29583   unsigned NumConcat = RegSize / InVT.getSizeInBits();
29584   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29585   Ops[0] = Zext0.getOperand(0);
29586   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29587   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29588   Ops[0] = Zext1.getOperand(0);
29589   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29590
29591   // Actually build the SAD
29592   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29593   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29594 }
29595
29596 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29597 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29598                                                 SelectionDAG &DAG,
29599                                                 const X86Subtarget &Subtarget) {
29600   // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29601   if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29602     return SDValue();
29603
29604   EVT ExtractVT = Extract->getValueType(0);
29605   unsigned BitWidth = ExtractVT.getSizeInBits();
29606   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29607       ExtractVT != MVT::i8)
29608     return SDValue();
29609
29610   // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29611   for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29612     SDValue Match = matchBinOpReduction(Extract, Op);
29613     if (!Match)
29614       continue;
29615
29616     // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29617     // which we can't support here for now.
29618     if (Match.getScalarValueSizeInBits() != BitWidth)
29619       continue;
29620
29621     // We require AVX2 for PMOVMSKB for v16i16/v32i8;
29622     unsigned MatchSizeInBits = Match.getValueSizeInBits();
29623     if (!(MatchSizeInBits == 128 ||
29624           (MatchSizeInBits == 256 &&
29625            ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29626       return SDValue();
29627
29628     // Don't bother performing this for 2-element vectors.
29629     if (Match.getValueType().getVectorNumElements() <= 2)
29630       return SDValue();
29631
29632     // Check that we are extracting a reduction of all sign bits.
29633     if (DAG.ComputeNumSignBits(Match) != BitWidth)
29634       return SDValue();
29635
29636     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29637     MVT MaskVT;
29638     if (64 == BitWidth || 32 == BitWidth)
29639       MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29640                                 MatchSizeInBits / BitWidth);
29641     else
29642       MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29643
29644     APInt CompareBits;
29645     ISD::CondCode CondCode;
29646     if (Op == ISD::OR) {
29647       // any_of -> MOVMSK != 0
29648       CompareBits = APInt::getNullValue(32);
29649       CondCode = ISD::CondCode::SETNE;
29650     } else {
29651       // all_of -> MOVMSK == ((1 << NumElts) - 1)
29652       CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29653       CondCode = ISD::CondCode::SETEQ;
29654     }
29655
29656     // Perform the select as i32/i64 and then truncate to avoid partial register
29657     // stalls.
29658     unsigned ResWidth = std::max(BitWidth, 32u);
29659     EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29660     SDLoc DL(Extract);
29661     SDValue Zero = DAG.getConstant(0, DL, ResVT);
29662     SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29663     SDValue Res = DAG.getBitcast(MaskVT, Match);
29664     Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29665     Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29666                           Ones, Zero, CondCode);
29667     return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29668   }
29669
29670   return SDValue();
29671 }
29672
29673 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29674                                       const X86Subtarget &Subtarget) {
        // Try to rewrite an add-reduction of absolute differences that ends in
        // the extract node Extract as a PSADBW-based sequence. Returns the
        // replacement extract on success and an empty SDValue when any
        // precondition or pattern match below fails.

29675   // PSADBW is only supported on SSE2 and up.
29676   if (!Subtarget.hasSSE2())
29677     return SDValue();
29678
29679   // Verify the type we're extracting from is any integer type above i16.
        // NOTE(review): only "simple type" and "element wider than 16 bits" are
        // tested here; integer-ness is not checked by this condition.
29680   EVT VT = Extract->getOperand(0).getValueType();
29681   if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29682     return SDValue();
29683
        // Widest vector register width (in bits) assumed for the SAD, chosen
        // from the subtarget features.
29684   unsigned RegSize = 128;
29685   if (Subtarget.hasBWI())
29686     RegSize = 512;
29687   else if (Subtarget.hasAVX2())
29688     RegSize = 256;
29689
29690   // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29691   // TODO: We should be able to handle larger vectors by splitting them before
29692   // feeding them into several SADs, and then reducing over those.
29693   if (RegSize / VT.getVectorNumElements() < 8)
29694     return SDValue();
29695
29696   // Match shuffle + add pyramid.
29697   SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29698
29699   // The operand is expected to be zero extended from i8
29700   // (verified in detectZextAbsDiff).
29701   // In order to convert to i64 and above, additional any/zero/sign
29702   // extend is expected.
29703   // The zero extend from 32 bit has no mathematical effect on the result.
29704   // Also the sign extend is basically zero extend
29705   // (extends the sign bit which is zero).
29706   // So it is correct to skip the sign/zero extend instruction.
29707   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29708     Root.getOpcode() == ISD::ZERO_EXTEND ||
29709     Root.getOpcode() == ISD::ANY_EXTEND))
29710     Root = Root.getOperand(0);
29711
29712   // If there was a match, we want Root to be a select that is the root of an
29713   // abs-diff pattern.
29714   if (!Root || (Root.getOpcode() != ISD::VSELECT))
29715     return SDValue();
29716
29717   // Check whether we have an abs-diff pattern feeding into the select.
        // On success Zext0/Zext1 receive the two operands handed to the SAD.
29718   SDValue Zext0, Zext1;
29719   if (!detectZextAbsDiff(Root, Zext0, Zext1))
29720     return SDValue();
29721
29722   // Create the SAD instruction.
29723   SDLoc DL(Extract);
29724   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29725
29726   // If the original vector was wider than 8 elements, sum over the results
29727   // in the SAD vector.
        // Stages is log2 of the element count of the extracted-from vector; the
        // first three stages are skipped (presumably because each SAD element
        // already covers 8 inputs -- confirm against createPSADBW).
29728   unsigned Stages = Log2_32(VT.getVectorNumElements());
29729   MVT SadVT = SAD.getSimpleValueType();
29730   if (Stages > 3) {
29731     unsigned SadElems = SadVT.getVectorNumElements();
29732
          // Each iteration folds elements [MaskEnd, 2*MaskEnd) onto elements
          // [0, MaskEnd) with a shuffle + add; all other lanes are left undef.
29733     for(unsigned i = Stages - 3; i > 0; --i) {
29734       SmallVector<int, 16> Mask(SadElems, -1);
29735       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29736         Mask[j] = MaskEnd + j;
29737
29738       SDValue Shuffle =
29739           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29740       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29741     }
29742   }
29743
        // Reinterpret the SAD vector as a vector of the extract's result type and
        // extract with the original index operand.
29744   MVT Type = Extract->getSimpleValueType(0);
29745   unsigned TypeSizeInBits = Type.getSizeInBits();
29746   // Return the lowest TypeSizeInBits bits.
29747   MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29748   SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29749   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29750                      Extract->getOperand(1));
29751 }
29752
29753 // Attempt to peek through a target shuffle and extract the scalar from the
29754 // source.
      //
      // N is the extract node (operand 0 = source vector, operand 1 = index).
      // Returns the replacement value, or an empty SDValue if the combine does not
      // apply (including whenever it runs before operation legalization).
29755 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29756                                          TargetLowering::DAGCombinerInfo &DCI,
29757                                          const X86Subtarget &Subtarget) {
29758   if (DCI.isBeforeLegalizeOps())
29759     return SDValue();
29760
29761   SDValue Src = N->getOperand(0);
29762   SDValue Idx = N->getOperand(1);
29763
29764   EVT VT = N->getValueType(0);
29765   EVT SrcVT = Src.getValueType();
29766   EVT SrcSVT = SrcVT.getVectorElementType();
29767   unsigned NumSrcElts = SrcVT.getVectorNumElements();
29768
29769   // Don't attempt this for boolean mask vectors or unknown extraction indices.
29770   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29771     return SDValue();
29772
29773   // Resolve the target shuffle inputs and mask.
29774   SmallVector<int, 16> Mask;
29775   SmallVector<SDValue, 2> Ops;
29776   if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
29777     return SDValue();
29778
29779   // Attempt to narrow/widen the shuffle mask to the correct size.
29780   if (Mask.size() != NumSrcElts) {
29781     if ((NumSrcElts % Mask.size()) == 0) {
            // The mask has fewer, wider elements than the source type: scale it
            // up by an integer factor.
29782       SmallVector<int, 16> ScaledMask;
29783       int Scale = NumSrcElts / Mask.size();
29784       scaleShuffleMask(Scale, Mask, ScaledMask);
29785       Mask = std::move(ScaledMask);
29786     } else if ((Mask.size() % NumSrcElts) == 0) {
            // The mask has more, narrower elements: keep widening for as long
            // as canWidenShuffleElements succeeds.
29787       SmallVector<int, 16> WidenedMask;
29788       while (Mask.size() > NumSrcElts &&
29789              canWidenShuffleElements(Mask, WidenedMask))
29790         Mask = std::move(WidenedMask);
29791       // TODO - investigate support for wider shuffle masks with known upper
29792       // undef/zero elements for implicit zero-extension.
29793     }
29794   }
29795
29796   // Check if narrowing/widening failed.
29797   if (Mask.size() != NumSrcElts)
29798     return SDValue();
29799
        // NOTE(review): Mask is indexed directly with the constant extract index;
        // no bounds check against NumSrcElts is visible here -- confirm that
        // out-of-range indices cannot reach this point.
29800   int SrcIdx = Mask[N->getConstantOperandVal(1)];
29801   SDLoc dl(N);
29802
29803   // If the shuffle source element is undef/zero then we can just accept it.
29804   if (SrcIdx == SM_SentinelUndef)
29805     return DAG.getUNDEF(VT);
29806
29807   if (SrcIdx == SM_SentinelZero)
29808     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29809                                 : DAG.getConstant(0, dl, VT);
29810
        // Mask entries index the concatenation of the shuffle inputs: the
        // quotient picks the input operand, the remainder the element within it.
29811   SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29812   SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29813   SrcIdx = SrcIdx % Mask.size();
29814
29815   // We can only extract other elements from 128-bit vectors and in certain
29816   // circumstances, depending on SSE-level.
29817   // TODO: Investigate using extract_subvector for larger vectors.
29818   // TODO: Investigate float/double extraction if it will be just stored.
29819   if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29820       ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29821     assert(SrcSVT == VT && "Unexpected extraction type");
29822     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29823                        DAG.getIntPtrConstant(SrcIdx, dl));
29824   }
29825
        // 16-bit / 8-bit elements: emit PEXTRW / PEXTRB producing an i32, record
        // that the value is zero-extended from the element type, then
        // zero-extend or truncate to the requested result type.
29826   if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29827       (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29828     assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29829            "Unexpected extraction type");
29830     unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29831     SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29832                                 DAG.getIntPtrConstant(SrcIdx, dl));
29833     SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29834                                  DAG.getValueType(SrcSVT));
29835     return DAG.getZExtOrTrunc(Assert, dl, VT);
29836   }
29837
29838   return SDValue();
29839 }
29840
29841 /// Detect vector gather/scatter index generation and convert it from being a
29842 /// bunch of shuffles and extracts into a somewhat faster sequence.
29843 /// For i686, the best sequence is apparently storing the value and loading
29844 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
      ///
      /// Several other extract-related combines are tried first. Returns the
      /// replacement value when one of them produces it; the final v4i32 rewrite
      /// replaces the extract nodes in place and returns an empty SDValue.
29845 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29846                                        TargetLowering::DAGCombinerInfo &DCI,
29847                                        const X86Subtarget &Subtarget) {
29848   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29849     return NewOp;
29850
29851   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29852     return NewOp;
29853
29854   SDValue InputVector = N->getOperand(0);
29855   SDValue EltIdx = N->getOperand(1);
29856
29857   EVT SrcVT = InputVector.getValueType();
29858   EVT VT = N->getValueType(0);
29859   SDLoc dl(InputVector);
29860
29861   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
29862   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29863       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29864     SDValue MMXSrc = InputVector.getOperand(0);
29865
29866     // The bitcast source is a direct mmx result.
29867     if (MMXSrc.getValueType() == MVT::x86mmx)
29868       return DAG.getBitcast(VT, InputVector);
29869   }
29870
29871   // Detect mmx to i32 conversion through a v2i32 elt extract.
29872   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29873       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29874     SDValue MMXSrc = InputVector.getOperand(0);
29875
29876     // The bitcast source is a direct mmx result.
29877     if (MMXSrc.getValueType() == MVT::x86mmx)
29878       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29879   }
29880
        // Extracting an i1 at a constant index from a bitcast of a scalar
        // constant folds to that bit of the constant.
        // NOTE(review): the shift count is not range-checked here -- confirm the
        // index is always below 64.
29881   if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29882       isa<ConstantSDNode>(EltIdx) &&
29883       isa<ConstantSDNode>(InputVector.getOperand(0))) {
29884     uint64_t ExtractedElt = N->getConstantOperandVal(1);
29885     uint64_t InputValue = InputVector.getConstantOperandVal(0);
29886     uint64_t Res = (InputValue >> ExtractedElt) & 1;
29887     return DAG.getConstant(Res, dl, MVT::i1);
29888   }
29889
29890   // Check whether this extract is the root of a sum of absolute differences
29891   // pattern. This has to be done here because we really want it to happen
29892   // pre-legalization,
29893   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29894     return SAD;
29895
29896   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29897   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29898     return Cmp;
29899
29900   // Only operate on vectors of 4 elements, where the alternative shuffling
29901   // gets to be more expensive.
29902   if (SrcVT != MVT::v4i32)
29903     return SDValue();
29904
29905   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29906   // single use which is a sign-extend or zero-extend, and all elements are
29907   // used.
        // Uses collects the qualifying extract nodes; ExtractedElements is a
        // bitmask with bit i set once element i has been seen.
29908   SmallVector<SDNode *, 4> Uses;
29909   unsigned ExtractedElements = 0;
29910   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29911        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
          // Ignore-by-bailing-out any use of a different result of the same node.
29912     if (UI.getUse().getResNo() != InputVector.getResNo())
29913       return SDValue();
29914
29915     SDNode *Extract = *UI;
29916     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29917       return SDValue();
29918
29919     if (Extract->getValueType(0) != MVT::i32)
29920       return SDValue();
29921     if (!Extract->hasOneUse())
29922       return SDValue();
29923     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29924         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29925       return SDValue();
29926     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29927       return SDValue();
29928
29929     // Record which element was extracted.
          // NOTE(review): the shift count comes straight from the constant index
          // with no range check -- confirm indices are always small enough.
29930     ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29931     Uses.push_back(Extract);
29932   }
29933
29934   // If not all the elements were used, this may not be worthwhile.
        // 15 == 0b1111: exactly elements 0..3 were extracted.
29935   if (ExtractedElements != 15)
29936     return SDValue();
29937
29938   // Ok, we've now decided to do the transformation.
29939   // If 64-bit shifts are legal, use the extract-shift sequence,
29940   // otherwise bounce the vector off the cache.
29941   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29942   SDValue Vals[4];
29943
29944   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
          // View the vector as v2i64 and pull out both 64-bit halves. Even
          // elements are the truncated halves; odd elements are the halves
          // shifted right (arithmetically) by 32 and then truncated.
29945     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29946     auto &DL = DAG.getDataLayout();
29947     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29948     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29949       DAG.getConstant(0, dl, VecIdxTy));
29950     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29951       DAG.getConstant(1, dl, VecIdxTy));
29952
29953     SDValue ShAmt = DAG.getConstant(
29954         32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29955     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29956     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29957       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29958     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29959     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29960       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29961   } else {
29962     // Store the value to a temporary stack slot.
29963     SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29964     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29965                               MachinePointerInfo());
29966
29967     EVT ElementType = SrcVT.getVectorElementType();
          // Element size in bytes, used as the stride between scalar loads.
29968     unsigned EltSize = ElementType.getSizeInBits() / 8;
29969
29970     // Replace each use (extract) with a load of the appropriate element.
29971     for (unsigned i = 0; i < 4; ++i) {
29972       uint64_t Offset = EltSize * i;
29973       auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29974       SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29975
29976       SDValue ScalarAddr =
29977           DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29978
29979       // Load the scalar.
            // The load is chained on the store (Ch) so it is ordered after it.
29980       Vals[i] =
29981           DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29982     }
29983   }
29984
29985   // Replace the extracts
        // Every recorded extract is redirected to the scalar computed for its
        // element index.
29986   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29987     UE = Uses.end(); UI != UE; ++UI) {
29988     SDNode *Extract = *UI;
29989
29990     uint64_t IdxVal = Extract->getConstantOperandVal(1);
29991     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29992   }
29993
29994   // The replacement was made in place; don't return anything.
29995   return SDValue();
29996 }
29997
29998 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29999 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
30000 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30001 // combineBasicSADPattern.
      //
      // For now this only forwards to combineExtractWithShuffle and returns its
      // result unchanged.
30002 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
30003                                            TargetLowering::DAGCombinerInfo &DCI,
30004                                            const X86Subtarget &Subtarget) {
30005   return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
30006 }
30007
30008 /// If a vector select has an operand that is -1 or 0, try to simplify the
30009 /// select to a bitwise logic operation.
      ///
      /// N's operands are (Cond, LHS, RHS), where LHS is the value chosen when the
      /// condition is true. Returns the simplified value, or an empty SDValue if N
      /// is not a VSELECT or none of the patterns below apply.
30010 static SDValue
30011 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30012                                  TargetLowering::DAGCombinerInfo &DCI,
30013                                  const X86Subtarget &Subtarget) {
30014   SDValue Cond = N->getOperand(0);
30015   SDValue LHS = N->getOperand(1);
30016   SDValue RHS = N->getOperand(2);
30017   EVT VT = LHS.getValueType();
30018   EVT CondVT = Cond.getValueType();
30019   SDLoc DL(N);
30020   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30021
30022   if (N->getOpcode() != ISD::VSELECT)
30023     return SDValue();
30024
30025   assert(CondVT.isVector() && "Vector select expects a vector selector!");
30026
        // NOTE(review): despite its name, this first use of FValIsAllZeros tests
        // LHS, i.e. the value selected when Cond is true; the variable is
        // re-assigned from RHS further down.
30027   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30028   // Check if the first operand is all zeros and Cond type is vXi1.
30029   // This situation only applies to avx512.
30030   if (FValIsAllZeros  && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30031       CondVT.getVectorElementType() == MVT::i1) {
30032     // Invert the cond to not(cond) : xor(op,allones)=not(op)
30033     SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30034                                   DAG.getAllOnesConstant(DL, CondVT));
30035     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30036     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30037   }
30038
30039   // To use the condition operand as a bitwise mask, it must have elements that
30040   // are the same size as the select elements. Ie, the condition operand must
30041   // have already been promoted from the IR select condition type <N x i1>.
30042   // Don't check if the types themselves are equal because that excludes
30043   // vector floating-point selects.
30044   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30045     return SDValue();
30046
        // From here on the flags follow their names: TVal* describes LHS and
        // FVal* describes RHS.
30047   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30048   FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30049
30050   // Try to invert the condition if true value is not all 1s and false value is
30051   // not all 0s.
30052   if (!TValIsAllOnes && !FValIsAllZeros &&
30053       // Check if the selector will be produced by CMPP*/PCMP*.
30054       Cond.getOpcode() == ISD::SETCC &&
30055       // Check if SETCC has already been promoted.
30056       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30057           CondVT) {
30058     bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30059     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30060
30061     if (TValIsAllZeros || FValIsAllOnes) {
            // Rebuild the setcc with the inverse condition code and swap the
            // select arms, so the constant operand ends up in the position the
            // patterns below look for.
30062       SDValue CC = Cond.getOperand(2);
30063       ISD::CondCode NewCC =
30064           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30065                                Cond.getOperand(0).getValueType().isInteger());
30066       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30067                           NewCC);
30068       std::swap(LHS, RHS);
30069       TValIsAllOnes = FValIsAllOnes;
30070       FValIsAllZeros = TValIsAllZeros;
30071     }
30072   }
30073
30074   // vselect Cond, 111..., 000... -> Cond
30075   if (TValIsAllOnes && FValIsAllZeros)
30076     return DAG.getBitcast(VT, Cond);
30077
        // The remaining rewrites create logic ops of type CondVT, so once
        // legalization has started that type must be legal.
30078   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
30079     return SDValue();
30080
30081   // vselect Cond, 111..., X -> or Cond, X
30082   if (TValIsAllOnes) {
30083     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30084     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30085     return DAG.getBitcast(VT, Or);
30086   }
30087
30088   // vselect Cond, X, 000... -> and Cond, X
30089   if (FValIsAllZeros) {
30090     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30091     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30092     return DAG.getBitcast(VT, And);
30093   }
30094
30095   return SDValue();
30096 }
30097
30098 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
        // Try to turn a select between two integer constants into arithmetic on
        // the zero-extended condition (a shift, or a multiply plus an add).
        // N's operands are (Cond, true value, false value). Returns the
        // replacement value, or an empty SDValue if no rewrite applies.
30099   SDValue Cond = N->getOperand(0);
30100   SDValue LHS = N->getOperand(1);
30101   SDValue RHS = N->getOperand(2);
30102   SDLoc DL(N);
30103
30104   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
30105   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
30106   if (!TrueC || !FalseC)
30107     return SDValue();
30108
30109   // Don't do this for crazy integer types.
30110   if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
30111     return SDValue();
30112
30113   // If this is efficiently invertible, canonicalize the TrueC/FalseC values
30114   // so that TrueC (the true value) is larger than FalseC.
        // The comparison is unsigned. NeedsCondInvert records that the constants
        // were swapped, so the condition must be inverted before it is used.
30115   bool NeedsCondInvert = false;
30116   if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
30117       // Efficiently invertible.
30118       (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
30119        (Cond.getOpcode() == ISD::XOR &&  // xor(X, C) -> invertible.
30120         isa<ConstantSDNode>(Cond.getOperand(1))))) {
30121     NeedsCondInvert = true;
30122     std::swap(TrueC, FalseC);
30123   }
30124
30125   // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
30126   if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
          // Inversion is done by xor with 1 (assumes Cond only ever holds 0 or
          // 1 -- TODO confirm).
30127     if (NeedsCondInvert) // Invert the condition if needed.
30128       Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30129                          DAG.getConstant(1, DL, Cond.getValueType()));
30130
30131     // Zero extend the condition if needed.
30132     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
30133
          // Shift by log2 of the power-of-two true value; the amount is an i8
          // constant.
30134     unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30135     return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
30136                        DAG.getConstant(ShAmt, DL, MVT::i8));
30137   }
30138
30139   // Optimize cases that will turn into an LEA instruction.  This requires
30140   // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30141   if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          // Difference between the two constants; for i32 it is truncated to
          // 32 bits.
30142     uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
30143     if (N->getValueType(0) == MVT::i32)
30144       Diff = (unsigned)Diff;
30145
30146     bool IsFastMultiplier = false;
30147     if (Diff < 10) {
30148       switch ((unsigned char)Diff) {
30149       default:
30150         break;
30151       case 1: // result = add base, cond
30152       case 2: // result = lea base(    , cond*2)
30153       case 3: // result = lea base(cond, cond*2)
30154       case 4: // result = lea base(    , cond*4)
30155       case 5: // result = lea base(cond, cond*4)
30156       case 8: // result = lea base(    , cond*8)
30157       case 9: // result = lea base(cond, cond*8)
30158         IsFastMultiplier = true;
30159         break;
30160       }
30161     }
30162
30163     if (IsFastMultiplier) {
            // Note: this APInt Diff shadows the uint64_t Diff declared above.
30164       APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
30165       if (NeedsCondInvert) // Invert the condition if needed.
30166         Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30167                            DAG.getConstant(1, DL, Cond.getValueType()));
30168
30169       // Zero extend the condition if needed.
30170       Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
30171       // Scale the condition by the difference.
30172       if (Diff != 1)
30173         Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30174                            DAG.getConstant(Diff, DL, Cond.getValueType()));
30175
30176       // Add the base if non-zero.
30177       if (FalseC->getAPIntValue() != 0)
30178         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30179                            SDValue(FalseC, 0));
30180       return Cond;
30181     }
30182   }
30183
30184   return SDValue();
30185 }
30186
30187 // If this is a bitcasted op that can be represented as another type, push the
30188 // bitcast to the inputs. This allows more opportunities for pattern
30189 // matching masked instructions. This is called when we know that the operation
30190 // is used as one of the inputs of a vselect.
      //
      // Returns true if OrigOp was replaced (via DCI.CombineTo) by an equivalent
      // node of OrigOp's type whose operands are bitcast instead, and false if
      // nothing was changed.
30191 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
30192                                       TargetLowering::DAGCombinerInfo &DCI) {
30193   // Make sure we have a bitcast.
30194   if (OrigOp.getOpcode() != ISD::BITCAST)
30195     return false;
30196
30197   SDValue Op = OrigOp.getOperand(0);
30198
30199   // If the operation is used by anything other than the bitcast, we shouldn't
30200   // do this combine as that would replicate the operation.
30201   if (!Op.hasOneUse())
30202     return false;
30203
        // VT / EltVT describe the type the bitcast produces, i.e. the type the
        // rewritten operation must have.
30204   MVT VT = OrigOp.getSimpleValueType();
30205   MVT EltVT = VT.getVectorElementType();
30206   SDLoc DL(Op.getNode());
30207
        // Helper for three-operand shuffles: bitcast the two vector operands to
        // VT, queue the new bitcasts on the worklist, and replace OrigOp with the
        // same opcode built on VT. Op2 is passed through unchanged.
30208   auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
30209                                       SDValue Op2) {
30210     Op0 = DAG.getBitcast(VT, Op0);
30211     DCI.AddToWorklist(Op0.getNode());
30212     Op1 = DAG.getBitcast(VT, Op1);
30213     DCI.AddToWorklist(Op1.getNode());
30214     DCI.CombineTo(OrigOp.getNode(),
30215                   DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
30216     return true;
30217   };
30218
30219   unsigned Opcode = Op.getOpcode();
30220   switch (Opcode) {
30221   case X86ISD::PALIGNR:
30222     // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
30223     if (!VT.is128BitVector())
30224       return false;
30225     Opcode = X86ISD::VALIGN;
30226     LLVM_FALLTHROUGH;
30227   case X86ISD::VALIGN: {
30228     if (EltVT != MVT::i32 && EltVT != MVT::i64)
30229       return false;
          // Convert the immediate from units of the original element size to
          // units of the new element size, going through a bit count.
30230     uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30231     MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30232     unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
30233     unsigned EltSize = EltVT.getSizeInBits();
30234     // Make sure we can represent the same shift with the new VT.
30235     if ((ShiftAmt % EltSize) != 0)
30236       return false;
30237     Imm = ShiftAmt / EltSize;
30238     return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30239                                     DAG.getConstant(Imm, DL, MVT::i8));
30240   }
30241   case X86ISD::SHUF128: {
30242     if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
30243       return false;
30244     // Only change element size, not type.
30245     if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30246       return false;
30247     return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30248                                     Op.getOperand(2));
30249   }
30250   case ISD::INSERT_SUBVECTOR: {
30251     unsigned EltSize = EltVT.getSizeInBits();
30252     if (EltSize != 32 && EltSize != 64)
30253       return false;
30254     MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30255     // Only change element size, not type.
30256     if (EltVT.isInteger() != OpEltVT.isInteger())
30257       return false;
          // Rescale the insertion index to the new element size.
          // NOTE(review): unlike the VALIGN case, divisibility is not checked
          // before dividing -- confirm the index is always suitably aligned.
30258     uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30259     Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30260     SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
30261     DCI.AddToWorklist(Op0.getNode());
30262     // Op1 needs to be bitcasted to a smaller vector with the same element type.
30263     SDValue Op1 = Op.getOperand(1);
30264     MVT Op1VT = MVT::getVectorVT(EltVT,
30265                             Op1.getSimpleValueType().getSizeInBits() / EltSize);
30266     Op1 = DAG.getBitcast(Op1VT, Op1);
30267     DCI.AddToWorklist(Op1.getNode());
30268     DCI.CombineTo(OrigOp.getNode(),
30269                   DAG.getNode(Opcode, DL, VT, Op0, Op1,
30270                               DAG.getIntPtrConstant(Imm, DL)));
30271     return true;
30272   }
30273   case ISD::EXTRACT_SUBVECTOR: {
30274     unsigned EltSize = EltVT.getSizeInBits();
30275     if (EltSize != 32 && EltSize != 64)
30276       return false;
30277     MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30278     // Only change element size, not type.
30279     if (EltVT.isInteger() != OpEltVT.isInteger())
30280       return false;
          // Rescale the extraction index to the new element size (same caveat
          // about divisibility as for INSERT_SUBVECTOR above).
30281     uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
30282     Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30283     // Op0 needs to be bitcasted to a larger vector with the same element type.
30284     SDValue Op0 = Op.getOperand(0);
30285     MVT Op0VT = MVT::getVectorVT(EltVT,
30286                             Op0.getSimpleValueType().getSizeInBits() / EltSize);
30287     Op0 = DAG.getBitcast(Op0VT, Op0);
30288     DCI.AddToWorklist(Op0.getNode());
30289     DCI.CombineTo(OrigOp.getNode(),
30290                   DAG.getNode(Opcode, DL, VT, Op0,
30291                               DAG.getIntPtrConstant(Imm, DL)));
30292     return true;
30293   }
30294   case X86ISD::SUBV_BROADCAST: {
30295     unsigned EltSize = EltVT.getSizeInBits();
30296     if (EltSize != 32 && EltSize != 64)
30297       return false;
30298     // Only change element size, not type.
30299     if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30300       return false;
          // Bitcast the broadcast source to a vector of the new element type
          // with the same total size, then rebuild the broadcast on VT.
30301     SDValue Op0 = Op.getOperand(0);
30302     MVT Op0VT = MVT::getVectorVT(EltVT,
30303                             Op0.getSimpleValueType().getSizeInBits() / EltSize);
30304     Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
30305     DCI.AddToWorklist(Op0.getNode());
30306     DCI.CombineTo(OrigOp.getNode(),
30307                   DAG.getNode(Opcode, DL, VT, Op0));
30308     return true;
30309   }
30310   }
30311
30312   return false;
30313 }
30314
30315 /// Do target-specific dag combines on SELECT and VSELECT nodes.
30316 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
30317                              TargetLowering::DAGCombinerInfo &DCI,
30318                              const X86Subtarget &Subtarget) {
30319   SDLoc DL(N);
30320   SDValue Cond = N->getOperand(0);
30321   // Get the LHS/RHS of the select.
30322   SDValue LHS = N->getOperand(1);
30323   SDValue RHS = N->getOperand(2);
30324   EVT VT = LHS.getValueType();
30325   EVT CondVT = Cond.getValueType();
30326   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30327
30328   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
30329   // instructions match the semantics of the common C idiom x<y?x:y but not
30330   // x<=y?x:y, because of how they handle negative zero (which can be
30331   // ignored in unsafe-math mode).
30332   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
30333   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
30334       VT != MVT::f80 && VT != MVT::f128 &&
30335       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
30336       (Subtarget.hasSSE2() ||
30337        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
30338     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30339
30340     unsigned Opcode = 0;
30341     // Check for x CC y ? x : y.
30342     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30343         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30344       switch (CC) {
30345       default: break;
30346       case ISD::SETULT:
30347         // Converting this to a min would handle NaNs incorrectly, and swapping
30348         // the operands would cause it to handle comparisons between positive
30349         // and negative zero incorrectly.
30350         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30351           if (!DAG.getTarget().Options.UnsafeFPMath &&
30352               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30353             break;
30354           std::swap(LHS, RHS);
30355         }
30356         Opcode = X86ISD::FMIN;
30357         break;
30358       case ISD::SETOLE:
30359         // Converting this to a min would handle comparisons between positive
30360         // and negative zero incorrectly.
30361         if (!DAG.getTarget().Options.UnsafeFPMath &&
30362             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30363           break;
30364         Opcode = X86ISD::FMIN;
30365         break;
30366       case ISD::SETULE:
30367         // Converting this to a min would handle both negative zeros and NaNs
30368         // incorrectly, but we can swap the operands to fix both.
30369         std::swap(LHS, RHS);
30370         LLVM_FALLTHROUGH;
30371       case ISD::SETOLT:
30372       case ISD::SETLT:
30373       case ISD::SETLE:
30374         Opcode = X86ISD::FMIN;
30375         break;
30376
30377       case ISD::SETOGE:
30378         // Converting this to a max would handle comparisons between positive
30379         // and negative zero incorrectly.
30380         if (!DAG.getTarget().Options.UnsafeFPMath &&
30381             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30382           break;
30383         Opcode = X86ISD::FMAX;
30384         break;
30385       case ISD::SETUGT:
30386         // Converting this to a max would handle NaNs incorrectly, and swapping
30387         // the operands would cause it to handle comparisons between positive
30388         // and negative zero incorrectly.
30389         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30390           if (!DAG.getTarget().Options.UnsafeFPMath &&
30391               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30392             break;
30393           std::swap(LHS, RHS);
30394         }
30395         Opcode = X86ISD::FMAX;
30396         break;
30397       case ISD::SETUGE:
30398         // Converting this to a max would handle both negative zeros and NaNs
30399         // incorrectly, but we can swap the operands to fix both.
30400         std::swap(LHS, RHS);
30401         LLVM_FALLTHROUGH;
30402       case ISD::SETOGT:
30403       case ISD::SETGT:
30404       case ISD::SETGE:
30405         Opcode = X86ISD::FMAX;
30406         break;
30407       }
30408     // Check for x CC y ? y : x -- a min/max with reversed arms.
30409     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30410                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
30411       switch (CC) {
30412       default: break;
30413       case ISD::SETOGE:
30414         // Converting this to a min would handle comparisons between positive
30415         // and negative zero incorrectly, and swapping the operands would
30416         // cause it to handle NaNs incorrectly.
30417         if (!DAG.getTarget().Options.UnsafeFPMath &&
30418             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
30419           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30420             break;
30421           std::swap(LHS, RHS);
30422         }
30423         Opcode = X86ISD::FMIN;
30424         break;
30425       case ISD::SETUGT:
30426         // Converting this to a min would handle NaNs incorrectly.
30427         if (!DAG.getTarget().Options.UnsafeFPMath &&
30428             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30429           break;
30430         Opcode = X86ISD::FMIN;
30431         break;
30432       case ISD::SETUGE:
30433         // Converting this to a min would handle both negative zeros and NaNs
30434         // incorrectly, but we can swap the operands to fix both.
30435         std::swap(LHS, RHS);
30436         LLVM_FALLTHROUGH;
30437       case ISD::SETOGT:
30438       case ISD::SETGT:
30439       case ISD::SETGE:
30440         Opcode = X86ISD::FMIN;
30441         break;
30442
30443       case ISD::SETULT:
30444         // Converting this to a max would handle NaNs incorrectly.
30445         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30446           break;
30447         Opcode = X86ISD::FMAX;
30448         break;
30449       case ISD::SETOLE:
30450         // Converting this to a max would handle comparisons between positive
30451         // and negative zero incorrectly, and swapping the operands would
30452         // cause it to handle NaNs incorrectly.
30453         if (!DAG.getTarget().Options.UnsafeFPMath &&
30454             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30455           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30456             break;
30457           std::swap(LHS, RHS);
30458         }
30459         Opcode = X86ISD::FMAX;
30460         break;
30461       case ISD::SETULE:
30462         // Converting this to a max would handle both negative zeros and NaNs
30463         // incorrectly, but we can swap the operands to fix both.
30464         std::swap(LHS, RHS);
30465         LLVM_FALLTHROUGH;
30466       case ISD::SETOLT:
30467       case ISD::SETLT:
30468       case ISD::SETLE:
30469         Opcode = X86ISD::FMAX;
30470         break;
30471       }
30472     }
30473
30474     if (Opcode)
30475       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30476   }
30477
30478   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30479   // lowering on KNL. In this case we convert it to
30480   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30481   // The same situation for all 128 and 256-bit vectors of i8 and i16.
30482   // Since SKX these selects have a proper lowering.
30483   if (Subtarget.hasAVX512() && CondVT.isVector() &&
30484       CondVT.getVectorElementType() == MVT::i1 &&
30485       (VT.is128BitVector() || VT.is256BitVector()) &&
30486       (VT.getVectorElementType() == MVT::i8 ||
30487        VT.getVectorElementType() == MVT::i16) &&
30488       !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30489     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30490     DCI.AddToWorklist(Cond.getNode());
30491     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30492   }
30493
30494   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30495     return V;
30496
30497   // Canonicalize max and min:
30498   // (x > y) ? x : y -> (x >= y) ? x : y
30499   // (x < y) ? x : y -> (x <= y) ? x : y
30500   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30501   // the need for an extra compare
30502   // against zero. e.g.
30503   // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
30504   // subl   %esi, %edi
30505   // testl  %edi, %edi
30506   // movl   $0, %eax
30507   // cmovgl %edi, %eax
30508   // =>
30509   // xorl   %eax, %eax
30510   // subl   %esi, %edi
30511   // cmovsl %eax, %edi
30512   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30513       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30514       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30515     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30516     switch (CC) {
30517     default: break;
30518     case ISD::SETLT:
30519     case ISD::SETGT: {
30520       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30521       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30522                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
30523       return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30524     }
30525     }
30526   }
30527
30528   // Early exit check
30529   if (!TLI.isTypeLegal(VT))
30530     return SDValue();
30531
30532   // Match VSELECTs into subs with unsigned saturation.
30533   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30534       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30535       ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30536        (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30537     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30538
30539     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30540     // left side invert the predicate to simplify logic below.
30541     SDValue Other;
30542     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30543       Other = RHS;
30544       CC = ISD::getSetCCInverse(CC, true);
30545     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30546       Other = LHS;
30547     }
30548
30549     if (Other.getNode() && Other->getNumOperands() == 2 &&
30550         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30551       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30552       SDValue CondRHS = Cond->getOperand(1);
30553
30554       // Look for a general sub with unsigned saturation first.
30555       // x >= y ? x-y : 0 --> subus x, y
30556       // x >  y ? x-y : 0 --> subus x, y
30557       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30558           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30559         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30560
30561       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30562         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30563           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30564             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30565               // If the RHS is a constant we have to reverse the const
30566               // canonicalization.
30567               // x > C-1 ? x+-C : 0 --> subus x, C
30568               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30569                   CondRHSConst->getAPIntValue() ==
30570                       (-OpRHSConst->getAPIntValue() - 1))
30571                 return DAG.getNode(
30572                     X86ISD::SUBUS, DL, VT, OpLHS,
30573                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30574
30575           // Another special case: If C was a sign bit, the sub has been
30576           // canonicalized into a xor.
30577           // FIXME: Would it be better to use computeKnownBits to determine
30578           //        whether it's safe to decanonicalize the xor?
30579           // x s< 0 ? x^C : 0 --> subus x, C
30580           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30581               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30582               OpRHSConst->getAPIntValue().isSignMask())
30583             // Note that we have to rebuild the RHS constant here to ensure we
30584             // don't rely on particular values of undef lanes.
30585             return DAG.getNode(
30586                 X86ISD::SUBUS, DL, VT, OpLHS,
30587                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30588         }
30589     }
30590   }
30591
30592   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30593     return V;
30594
30595   // If this is a *dynamic* select (non-constant condition) and we can match
30596   // this node with one of the variable blend instructions, restructure the
30597   // condition so that blends can use the high (sign) bit of each element and
30598   // use SimplifyDemandedBits to simplify the condition operand.
30599   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30600       !DCI.isBeforeLegalize() &&
30601       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30602     unsigned BitWidth = Cond.getScalarValueSizeInBits();
30603
30604     // Don't optimize vector selects that map to mask-registers.
30605     if (BitWidth == 1)
30606       return SDValue();
30607
30608     // We can only handle the cases where VSELECT is directly legal on the
30609     // subtarget. We custom lower VSELECT nodes with constant conditions and
30610     // this makes it hard to see whether a dynamic VSELECT will correctly
30611     // lower, so we both check the operation's status and explicitly handle the
30612     // cases where a *dynamic* blend will fail even though a constant-condition
30613     // blend could be custom lowered.
30614     // FIXME: We should find a better way to handle this class of problems.
30615     // Potentially, we should combine constant-condition vselect nodes
30616     // pre-legalization into shuffles and not mark as many types as custom
30617     // lowered.
30618     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30619       return SDValue();
30620     // FIXME: We don't support i16-element blends currently. We could and
30621     // should support them by making *all* the bits in the condition be set
30622     // rather than just the high bit and using an i8-element blend.
30623     if (VT.getVectorElementType() == MVT::i16)
30624       return SDValue();
30625     // Dynamic blending was only available from SSE4.1 onward.
30626     if (VT.is128BitVector() && !Subtarget.hasSSE41())
30627       return SDValue();
30628     // Byte blends are only available in AVX2
30629     if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30630       return SDValue();
30631
30632     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30633     APInt DemandedMask(APInt::getSignMask(BitWidth));
30634     KnownBits Known;
30635     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
30636                                           !DCI.isBeforeLegalizeOps());
30637     if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30638         TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30639       // If we changed the computation somewhere in the DAG, this change will
30640       // affect all users of Cond. Make sure it is fine and update all the nodes
30641       // so that we do not use the generic VSELECT anymore. Otherwise, we may
30642       // perform wrong optimizations as we messed with the actual expectation
30643       // for the vector boolean values.
30644       if (Cond != TLO.Old) {
30645         // Check all uses of the condition operand to check whether it will be
30646         // consumed by non-BLEND instructions. Those may require that all bits
30647         // are set properly.
30648         for (SDNode *U : Cond->uses()) {
30649           // TODO: Add other opcodes eventually lowered into BLEND.
30650           if (U->getOpcode() != ISD::VSELECT)
30651             return SDValue();
30652         }
30653
30654         // Update all users of the condition before committing the change, so
30655         // that the VSELECT optimizations that expect the correct vector boolean
30656         // value will not be triggered.
30657         for (SDNode *U : Cond->uses()) {
30658           SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30659                                    U->getValueType(0), Cond, U->getOperand(1),
30660                                    U->getOperand(2));
30661           DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30662         }
30663         DCI.CommitTargetLoweringOpt(TLO);
30664         return SDValue();
30665       }
30666       // Only Cond (rather than other nodes in the computation chain) was
30667       // changed. Change the condition just for N to keep the opportunity to
30668       // optimize all other users their own way.
30669       SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30670       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30671       return SDValue();
30672     }
30673   }
30674
30675   // Look for vselects with LHS/RHS being bitcasted from an operation that
30676   // can be executed on another type. Push the bitcast to the inputs of
30677   // the operation. This exposes opportunities for using masking instructions.
30678   if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30679       CondVT.getVectorElementType() == MVT::i1) {
30680     if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30681       return SDValue(N, 0);
30682     if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30683       return SDValue(N, 0);
30684   }
30685
30686   // Custom action for SELECT MMX
30687   if (VT == MVT::x86mmx) {
30688     LHS = DAG.getBitcast(MVT::i64, LHS);
30689     RHS = DAG.getBitcast(MVT::i64, RHS);
30690     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
30691     return DAG.getBitcast(VT, newSelect);
30692   }
30693
30694   return SDValue();
30695 }
30696
30697 /// Combine:
30698 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30699 /// to:
30700 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30701 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30702 /// Note that this is only legal for some op/cc combinations.
30703 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30704                                        SelectionDAG &DAG) {
30705   // This combine only operates on CMP-like nodes.
30706   if (!(Cmp.getOpcode() == X86ISD::CMP ||
30707         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30708     return SDValue();
30709
30710   // Can't replace the cmp if it has more uses than the one we're looking at.
30711   // FIXME: We would like to be able to handle this, but would need to make sure
30712   // all uses were updated.
30713   if (!Cmp.hasOneUse())
30714     return SDValue();
30715
30716   // This only applies to variations of the common case:
30717   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30718   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30719   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30720   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30721   // Using the proper condcodes (see below), overflow is checked for.
30722
30723   // FIXME: We can generalize both constraints:
30724   // - XOR/OR/AND (if they were made to survive AtomicExpand)
30725   // - LHS != 1
30726   // if the result is compared.
30727
30728   SDValue CmpLHS = Cmp.getOperand(0);
30729   SDValue CmpRHS = Cmp.getOperand(1);
30730
30731   if (!CmpLHS.hasOneUse())
30732     return SDValue();
30733
30734   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30735   if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30736     return SDValue();
30737
30738   const unsigned Opc = CmpLHS.getOpcode();
30739
30740   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30741     return SDValue();
30742
30743   SDValue OpRHS = CmpLHS.getOperand(2);
30744   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30745   if (!OpRHSC)
30746     return SDValue();
30747
30748   APInt Addend = OpRHSC->getAPIntValue();
30749   if (Opc == ISD::ATOMIC_LOAD_SUB)
30750     Addend = -Addend;
30751
30752   if (CC == X86::COND_S && Addend == 1)
30753     CC = X86::COND_LE;
30754   else if (CC == X86::COND_NS && Addend == 1)
30755     CC = X86::COND_G;
30756   else if (CC == X86::COND_G && Addend == -1)
30757     CC = X86::COND_GE;
30758   else if (CC == X86::COND_LE && Addend == -1)
30759     CC = X86::COND_L;
30760   else
30761     return SDValue();
30762
30763   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30764   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30765                                 DAG.getUNDEF(CmpLHS.getValueType()));
30766   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30767   return LockOp;
30768 }
30769
30770 // Check whether a boolean test is testing a boolean value generated by
30771 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
30772 // code.
30773 //
30774 // Simplify the following patterns:
30775 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30776 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30777 // to (Op EFLAGS Cond)
30778 //
30779 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30780 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30781 // to (Op EFLAGS !Cond)
30782 //
30783 // where Op could be BRCOND or CMOV.
30784 //
30785 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30786   // This combine only operates on CMP-like nodes.
30787   if (!(Cmp.getOpcode() == X86ISD::CMP ||
30788         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30789     return SDValue();
30790
30791   // Quit if not used as a boolean value.
30792   if (CC != X86::COND_E && CC != X86::COND_NE)
30793     return SDValue();
30794
30795   // Check CMP operands. One of them should be 0 or 1 and the other should be
30796   // an SetCC or extended from it.
30797   SDValue Op1 = Cmp.getOperand(0);
30798   SDValue Op2 = Cmp.getOperand(1);
30799
30800   SDValue SetCC;
30801   const ConstantSDNode* C = nullptr;
30802   bool needOppositeCond = (CC == X86::COND_E);
30803   bool checkAgainstTrue = false; // Is it a comparison against 1?
30804
30805   if ((C = dyn_cast<ConstantSDNode>(Op1)))
30806     SetCC = Op2;
30807   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30808     SetCC = Op1;
30809   else // Quit if all operands are not constants.
30810     return SDValue();
30811
30812   if (C->getZExtValue() == 1) {
30813     needOppositeCond = !needOppositeCond;
30814     checkAgainstTrue = true;
30815   } else if (C->getZExtValue() != 0)
30816     // Quit if the constant is neither 0 or 1.
30817     return SDValue();
30818
30819   bool truncatedToBoolWithAnd = false;
30820   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30821   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30822          SetCC.getOpcode() == ISD::TRUNCATE ||
30823          SetCC.getOpcode() == ISD::AND) {
30824     if (SetCC.getOpcode() == ISD::AND) {
30825       int OpIdx = -1;
30826       if (isOneConstant(SetCC.getOperand(0)))
30827         OpIdx = 1;
30828       if (isOneConstant(SetCC.getOperand(1)))
30829         OpIdx = 0;
30830       if (OpIdx < 0)
30831         break;
30832       SetCC = SetCC.getOperand(OpIdx);
30833       truncatedToBoolWithAnd = true;
30834     } else
30835       SetCC = SetCC.getOperand(0);
30836   }
30837
30838   switch (SetCC.getOpcode()) {
30839   case X86ISD::SETCC_CARRY:
30840     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30841     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30842     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30843     // truncated to i1 using 'and'.
30844     if (checkAgainstTrue && !truncatedToBoolWithAnd)
30845       break;
30846     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30847            "Invalid use of SETCC_CARRY!");
30848     LLVM_FALLTHROUGH;
30849   case X86ISD::SETCC:
30850     // Set the condition code or opposite one if necessary.
30851     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30852     if (needOppositeCond)
30853       CC = X86::GetOppositeBranchCondition(CC);
30854     return SetCC.getOperand(1);
30855   case X86ISD::CMOV: {
30856     // Check whether false/true value has canonical one, i.e. 0 or 1.
30857     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30858     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30859     // Quit if true value is not a constant.
30860     if (!TVal)
30861       return SDValue();
30862     // Quit if false value is not a constant.
30863     if (!FVal) {
30864       SDValue Op = SetCC.getOperand(0);
30865       // Skip 'zext' or 'trunc' node.
30866       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30867           Op.getOpcode() == ISD::TRUNCATE)
30868         Op = Op.getOperand(0);
30869       // A special case for rdrand/rdseed, where 0 is set if false cond is
30870       // found.
30871       if ((Op.getOpcode() != X86ISD::RDRAND &&
30872            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30873         return SDValue();
30874     }
30875     // Quit if false value is not the constant 0 or 1.
30876     bool FValIsFalse = true;
30877     if (FVal && FVal->getZExtValue() != 0) {
30878       if (FVal->getZExtValue() != 1)
30879         return SDValue();
30880       // If FVal is 1, opposite cond is needed.
30881       needOppositeCond = !needOppositeCond;
30882       FValIsFalse = false;
30883     }
30884     // Quit if TVal is not the constant opposite of FVal.
30885     if (FValIsFalse && TVal->getZExtValue() != 1)
30886       return SDValue();
30887     if (!FValIsFalse && TVal->getZExtValue() != 0)
30888       return SDValue();
30889     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30890     if (needOppositeCond)
30891       CC = X86::GetOppositeBranchCondition(CC);
30892     return SetCC.getOperand(3);
30893   }
30894   }
30895
30896   return SDValue();
30897 }
30898
30899 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30900 /// Match:
30901 ///   (X86or (X86setcc) (X86setcc))
30902 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
30903 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30904                                            X86::CondCode &CC1, SDValue &Flags,
30905                                            bool &isAnd) {
30906   if (Cond->getOpcode() == X86ISD::CMP) {
30907     if (!isNullConstant(Cond->getOperand(1)))
30908       return false;
30909
30910     Cond = Cond->getOperand(0);
30911   }
30912
30913   isAnd = false;
30914
30915   SDValue SetCC0, SetCC1;
30916   switch (Cond->getOpcode()) {
30917   default: return false;
30918   case ISD::AND:
30919   case X86ISD::AND:
30920     isAnd = true;
30921     LLVM_FALLTHROUGH;
30922   case ISD::OR:
30923   case X86ISD::OR:
30924     SetCC0 = Cond->getOperand(0);
30925     SetCC1 = Cond->getOperand(1);
30926     break;
30927   };
30928
30929   // Make sure we have SETCC nodes, using the same flags value.
30930   if (SetCC0.getOpcode() != X86ISD::SETCC ||
30931       SetCC1.getOpcode() != X86ISD::SETCC ||
30932       SetCC0->getOperand(1) != SetCC1->getOperand(1))
30933     return false;
30934
30935   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30936   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30937   Flags = SetCC0->getOperand(1);
30938   return true;
30939 }
30940
30941 /// Optimize an EFLAGS definition used according to the condition code \p CC
30942 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30943 /// uses of chain values.
30944 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30945                                   SelectionDAG &DAG) {
30946   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30947     return R;
30948   return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30949 }
30950
30951 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30952 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30953                            TargetLowering::DAGCombinerInfo &DCI,
30954                            const X86Subtarget &Subtarget) {
30955   SDLoc DL(N);
30956
30957   // If the flag operand isn't dead, don't touch this CMOV.
30958   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
30959     return SDValue();
30960
30961   SDValue FalseOp = N->getOperand(0);
30962   SDValue TrueOp = N->getOperand(1);
30963   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30964   SDValue Cond = N->getOperand(3);
30965
30966   if (CC == X86::COND_E || CC == X86::COND_NE) {
30967     switch (Cond.getOpcode()) {
30968     default: break;
30969     case X86ISD::BSR:
30970     case X86ISD::BSF:
30971       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
30972       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
30973         return (CC == X86::COND_E) ? FalseOp : TrueOp;
30974     }
30975   }
30976
30977   // Try to simplify the EFLAGS and condition code operands.
30978   // We can't always do this as FCMOV only supports a subset of X86 cond.
30979   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30980     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
30981       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
30982         Flags};
30983       return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
30984     }
30985   }
30986
30987   // If this is a select between two integer constants, try to do some
30988   // optimizations.  Note that the operands are ordered the opposite of SELECT
30989   // operands.
30990   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30991     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30992       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30993       // larger than FalseC (the false value).
30994       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30995         CC = X86::GetOppositeBranchCondition(CC);
30996         std::swap(TrueC, FalseC);
30997         std::swap(TrueOp, FalseOp);
30998       }
30999
31000       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
31001       // This is efficient for any integer data type (including i8/i16) and
31002       // shift amount.
31003       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31004         Cond = getSETCC(CC, Cond, DL, DAG);
31005
31006         // Zero extend the condition if needed.
31007         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31008
31009         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31010         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31011                            DAG.getConstant(ShAmt, DL, MVT::i8));
31012         if (N->getNumValues() == 2)  // Dead flag value?
31013           return DCI.CombineTo(N, Cond, SDValue());
31014         return Cond;
31015       }
31016
31017       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
31018       // for any integer data type, including i8/i16.
31019       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31020         Cond = getSETCC(CC, Cond, DL, DAG);
31021
31022         // Zero extend the condition if needed.
31023         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31024                            FalseC->getValueType(0), Cond);
31025         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31026                            SDValue(FalseC, 0));
31027
31028         if (N->getNumValues() == 2)  // Dead flag value?
31029           return DCI.CombineTo(N, Cond, SDValue());
31030         return Cond;
31031       }
31032
31033       // Optimize cases that will turn into an LEA instruction.  This requires
31034       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31035       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31036         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31037         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31038
31039         bool isFastMultiplier = false;
31040         if (Diff < 10) {
31041           switch ((unsigned char)Diff) {
31042           default: break;
31043           case 1:  // result = add base, cond
31044           case 2:  // result = lea base(    , cond*2)
31045           case 3:  // result = lea base(cond, cond*2)
31046           case 4:  // result = lea base(    , cond*4)
31047           case 5:  // result = lea base(cond, cond*4)
31048           case 8:  // result = lea base(    , cond*8)
31049           case 9:  // result = lea base(cond, cond*8)
31050             isFastMultiplier = true;
31051             break;
31052           }
31053         }
31054
31055         if (isFastMultiplier) {
31056           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31057           Cond = getSETCC(CC, Cond, DL ,DAG);
31058           // Zero extend the condition if needed.
31059           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31060                              Cond);
31061           // Scale the condition by the difference.
31062           if (Diff != 1)
31063             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31064                                DAG.getConstant(Diff, DL, Cond.getValueType()));
31065
31066           // Add the base if non-zero.
31067           if (FalseC->getAPIntValue() != 0)
31068             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31069                                SDValue(FalseC, 0));
31070           if (N->getNumValues() == 2)  // Dead flag value?
31071             return DCI.CombineTo(N, Cond, SDValue());
31072           return Cond;
31073         }
31074       }
31075     }
31076   }
31077
31078   // Handle these cases:
31079   //   (select (x != c), e, c) -> (select (x != c), e, x),
31080   //   (select (x == c), c, e) -> (select (x == c), x, e)
31081   // where the c is an integer constant, and the "select" is the combination
31082   // of CMOV and CMP.
31083   //
31084   // The rationale for this change is that the conditional-move from a constant
31085   // needs two instructions, however, conditional-move from a register needs
31086   // only one instruction.
31087   //
31088   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31089   //  some instruction-combining opportunities. This opt needs to be
31090   //  postponed as late as possible.
31091   //
31092   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
31093     // the DCI.xxxx conditions are provided to postpone the optimization as
31094     // late as possible.
31095
31096     ConstantSDNode *CmpAgainst = nullptr;
31097     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31098         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31099         !isa<ConstantSDNode>(Cond.getOperand(0))) {
31100
31101       if (CC == X86::COND_NE &&
31102           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31103         CC = X86::GetOppositeBranchCondition(CC);
31104         std::swap(TrueOp, FalseOp);
31105       }
31106
31107       if (CC == X86::COND_E &&
31108           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31109         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31110                           DAG.getConstant(CC, DL, MVT::i8), Cond };
31111         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
31112       }
31113     }
31114   }
31115
31116   // Fold and/or of setcc's to double CMOV:
31117   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31118   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31119   //
31120   // This combine lets us generate:
31121   //   cmovcc1 (jcc1 if we don't have CMOV)
31122   //   cmovcc2 (same)
31123   // instead of:
31124   //   setcc1
31125   //   setcc2
31126   //   and/or
31127   //   cmovne (jne if we don't have CMOV)
31128   // When we can't use the CMOV instruction, it might increase branch
31129   // mispredicts.
31130   // When we can use CMOV, or when there is no mispredict, this improves
31131   // throughput and reduces register pressure.
31132   //
31133   if (CC == X86::COND_NE) {
31134     SDValue Flags;
31135     X86::CondCode CC0, CC1;
31136     bool isAndSetCC;
31137     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
31138       if (isAndSetCC) {
31139         std::swap(FalseOp, TrueOp);
31140         CC0 = X86::GetOppositeBranchCondition(CC0);
31141         CC1 = X86::GetOppositeBranchCondition(CC1);
31142       }
31143
31144       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31145         Flags};
31146       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
31147       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31148       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31149       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
31150       return CMOV;
31151     }
31152   }
31153
31154   return SDValue();
31155 }
31156
/// Shrinking modes that can be selected when a 32-bit vector multiply is
/// narrowed to 16-bit elements. The mode records which value range both
/// operands are known to fit in.
enum ShrinkMode {
  MULS8,  ///< Operand ranges are -128 ~ 127.
  MULU8,  ///< Operand ranges are 0 ~ 255.
  MULS16, ///< Operand ranges are -32768 ~ 32767.
  MULU16  ///< Operand ranges are 0 ~ 65535.
};
31159
31160 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31161   EVT VT = N->getOperand(0).getValueType();
31162   if (VT.getScalarSizeInBits() != 32)
31163     return false;
31164
31165   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31166   unsigned SignBits[2] = {1, 1};
31167   bool IsPositive[2] = {false, false};
31168   for (unsigned i = 0; i < 2; i++) {
31169     SDValue Opd = N->getOperand(i);
31170
31171     // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
31172     // compute signbits for it separately.
31173     if (Opd.getOpcode() == ISD::ANY_EXTEND) {
31174       // For anyextend, it is safe to assume an appropriate number of leading
31175       // sign/zero bits.
31176       if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
31177         SignBits[i] = 25;
31178       else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
31179                MVT::i16)
31180         SignBits[i] = 17;
31181       else
31182         return false;
31183       IsPositive[i] = true;
31184     } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31185       // All the operands of BUILD_VECTOR need to be int constant.
31186       // Find the smallest value range which all the operands belong to.
31187       SignBits[i] = 32;
31188       IsPositive[i] = true;
31189       for (const SDValue &SubOp : Opd.getNode()->op_values()) {
31190         if (SubOp.isUndef())
31191           continue;
31192         auto *CN = dyn_cast<ConstantSDNode>(SubOp);
31193         if (!CN)
31194           return false;
31195         APInt IntVal = CN->getAPIntValue();
31196         if (IntVal.isNegative())
31197           IsPositive[i] = false;
31198         SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
31199       }
31200     } else {
31201       SignBits[i] = DAG.ComputeNumSignBits(Opd);
31202       if (Opd.getOpcode() == ISD::ZERO_EXTEND)
31203         IsPositive[i] = true;
31204     }
31205   }
31206
31207   bool AllPositive = IsPositive[0] && IsPositive[1];
31208   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
31209   // When ranges are from -128 ~ 127, use MULS8 mode.
31210   if (MinSignBits >= 25)
31211     Mode = MULS8;
31212   // When ranges are from 0 ~ 255, use MULU8 mode.
31213   else if (AllPositive && MinSignBits >= 24)
31214     Mode = MULU8;
31215   // When ranges are from -32768 ~ 32767, use MULS16 mode.
31216   else if (MinSignBits >= 17)
31217     Mode = MULS16;
31218   // When ranges are from 0 ~ 65535, use MULU16 mode.
31219   else if (AllPositive && MinSignBits >= 16)
31220     Mode = MULU16;
31221   else
31222     return false;
31223   return true;
31224 }
31225
31226 /// When the operands of vector mul are extended from smaller size values,
31227 /// like i8 and i16, the type of mul may be shrinked to generate more
31228 /// efficient code. Two typical patterns are handled:
31229 /// Pattern1:
31230 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
31231 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
31232 //   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31233 ///     %5 = mul <N x i32> %2, %4
31234 ///
31235 /// Pattern2:
31236 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
31237 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
31238 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31239 ///     %5 = mul <N x i32> %2, %4
31240 ///
31241 /// There are four mul shrinking modes:
31242 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31243 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
31244 /// generate pmullw+sext32 for it (MULS8 mode).
31245 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
31246 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
31247 /// generate pmullw+zext32 for it (MULU8 mode).
31248 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
31249 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
31250 /// generate pmullw+pmulhw for it (MULS16 mode).
31251 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
31252 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
31253 /// generate pmullw+pmulhuw for it (MULU16 mode).
31254 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
31255                                const X86Subtarget &Subtarget) {
31256   // Check for legality
31257   // pmullw/pmulhw are not supported by SSE.
31258   if (!Subtarget.hasSSE2())
31259     return SDValue();
31260
31261   // Check for profitability
31262   // pmulld is supported since SSE41. It is better to use pmulld
31263   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
31264   // the expansion.
31265   bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
31266   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
31267     return SDValue();
31268
31269   ShrinkMode Mode;
31270   if (!canReduceVMulWidth(N, DAG, Mode))
31271     return SDValue();
31272
31273   SDLoc DL(N);
31274   SDValue N0 = N->getOperand(0);
31275   SDValue N1 = N->getOperand(1);
31276   EVT VT = N->getOperand(0).getValueType();
31277   unsigned RegSize = 128;
31278   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
31279   EVT ReducedVT =
31280       EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
31281   // Shrink the operands of mul.
31282   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
31283   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
31284
31285   if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
31286     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
31287     // lower part is needed.
31288     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
31289     if (Mode == MULU8 || Mode == MULS8) {
31290       return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
31291                          DL, VT, MulLo);
31292     } else {
31293       MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31294       // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
31295       // the higher part is also needed.
31296       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31297                                   ReducedVT, NewN0, NewN1);
31298
31299       // Repack the lower part and higher part result of mul into a wider
31300       // result.
31301       // Generate shuffle functioning as punpcklwd.
31302       SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
31303       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31304         ShuffleMask[2 * i] = i;
31305         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
31306       }
31307       SDValue ResLo =
31308           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31309       ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
31310       // Generate shuffle functioning as punpckhwd.
31311       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31312         ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
31313         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
31314       }
31315       SDValue ResHi =
31316           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31317       ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
31318       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
31319     }
31320   } else {
31321     // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
31322     // to legalize the mul explicitly because implicit legalization for type
31323     // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
31324     // instructions which will not exist when we explicitly legalize it by
31325     // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
31326     // <4 x i16> undef).
31327     //
31328     // Legalize the operands of mul.
31329     // FIXME: We may be able to handle non-concatenated vectors by insertion.
31330     unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
31331     if ((RegSize % ReducedSizeInBits) != 0)
31332       return SDValue();
31333
31334     SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
31335                                  DAG.getUNDEF(ReducedVT));
31336     Ops[0] = NewN0;
31337     NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31338     Ops[0] = NewN1;
31339     NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31340
31341     if (Mode == MULU8 || Mode == MULS8) {
31342       // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
31343       // part is needed.
31344       SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31345
31346       // convert the type of mul result to VT.
31347       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31348       SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
31349                                               : ISD::SIGN_EXTEND_VECTOR_INREG,
31350                                 DL, ResVT, Mul);
31351       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31352                          DAG.getIntPtrConstant(0, DL));
31353     } else {
31354       // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
31355       // MULU16/MULS16, both parts are needed.
31356       SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31357       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31358                                   OpsVT, NewN0, NewN1);
31359
31360       // Repack the lower part and higher part result of mul into a wider
31361       // result. Make sure the type of mul result is VT.
31362       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31363       SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31364       Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31365       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31366                          DAG.getIntPtrConstant(0, DL));
31367     }
31368   }
31369 }
31370
31371 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
31372                                  EVT VT, SDLoc DL) {
31373
31374   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
31375     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31376                                  DAG.getConstant(Mult, DL, VT));
31377     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
31378                          DAG.getConstant(Shift, DL, MVT::i8));
31379     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31380                          N->getOperand(0));
31381     return Result;
31382   };
31383
31384   auto combineMulMulAddOrSub = [&](bool isAdd) {
31385     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31386                                  DAG.getConstant(9, DL, VT));
31387     Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
31388     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31389                          N->getOperand(0));
31390     return Result;
31391   };
31392
31393   switch (MulAmt) {
31394   default:
31395     break;
31396   case 11:
31397     // mul x, 11 => add ((shl (mul x, 5), 1), x)
31398     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
31399   case 21:
31400     // mul x, 21 => add ((shl (mul x, 5), 2), x)
31401     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
31402   case 22:
31403     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
31404     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31405                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
31406   case 19:
31407     // mul x, 19 => sub ((shl (mul x, 5), 2), x)
31408     return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
31409   case 13:
31410     // mul x, 13 => add ((shl (mul x, 3), 2), x)
31411     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
31412   case 23:
31413     // mul x, 13 => sub ((shl (mul x, 3), 3), x)
31414     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
31415   case 14:
31416     // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
31417     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31418                        combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
31419   case 26:
31420     // mul x, 26 => sub ((mul (mul x, 9), 3), x)
31421     return combineMulMulAddOrSub(/*isAdd*/ false);
31422   case 28:
31423     // mul x, 28 => add ((mul (mul x, 9), 3), x)
31424     return combineMulMulAddOrSub(/*isAdd*/ true);
31425   case 29:
31426     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
31427     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31428                        combineMulMulAddOrSub(/*isAdd*/ true));
31429   case 30:
31430     // mul x, 30 => sub (sub ((shl x, 5), x), x)
31431     return DAG.getNode(
31432         ISD::SUB, DL, VT,
31433         DAG.getNode(ISD::SUB, DL, VT,
31434                     DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31435                                 DAG.getConstant(5, DL, MVT::i8)),
31436                     N->getOperand(0)),
31437         N->getOperand(0));
31438   }
31439   return SDValue();
31440 }
31441
31442 /// Optimize a single multiply with constant into two operations in order to
31443 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
31444 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31445                           TargetLowering::DAGCombinerInfo &DCI,
31446                           const X86Subtarget &Subtarget) {
31447   EVT VT = N->getValueType(0);
31448   if (DCI.isBeforeLegalize() && VT.isVector())
31449     return reduceVMULWidth(N, DAG, Subtarget);
31450
31451   if (!MulConstantOptimization)
31452     return SDValue();
31453   // An imul is usually smaller than the alternative sequence.
31454   if (DAG.getMachineFunction().getFunction()->optForMinSize())
31455     return SDValue();
31456
31457   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
31458     return SDValue();
31459
31460   if (VT != MVT::i64 && VT != MVT::i32)
31461     return SDValue();
31462
31463   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
31464   if (!C)
31465     return SDValue();
31466   uint64_t MulAmt = C->getZExtValue();
31467   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
31468     return SDValue();
31469
31470   uint64_t MulAmt1 = 0;
31471   uint64_t MulAmt2 = 0;
31472   if ((MulAmt % 9) == 0) {
31473     MulAmt1 = 9;
31474     MulAmt2 = MulAmt / 9;
31475   } else if ((MulAmt % 5) == 0) {
31476     MulAmt1 = 5;
31477     MulAmt2 = MulAmt / 5;
31478   } else if ((MulAmt % 3) == 0) {
31479     MulAmt1 = 3;
31480     MulAmt2 = MulAmt / 3;
31481   }
31482
31483   SDLoc DL(N);
31484   SDValue NewMul;
31485   if (MulAmt2 &&
31486       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
31487
31488     if (isPowerOf2_64(MulAmt2) &&
31489         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
31490       // If second multiplifer is pow2, issue it first. We want the multiply by
31491       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
31492       // is an add.
31493       std::swap(MulAmt1, MulAmt2);
31494
31495     if (isPowerOf2_64(MulAmt1))
31496       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31497                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31498     else
31499       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31500                            DAG.getConstant(MulAmt1, DL, VT));
31501
31502     if (isPowerOf2_64(MulAmt2))
31503       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31504                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31505     else
31506       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31507                            DAG.getConstant(MulAmt2, DL, VT));
31508   } else if (!Subtarget.slowLEA())
31509     NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
31510
31511   if (!NewMul) {
31512     assert(MulAmt != 0 &&
31513            MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31514            "Both cases that could cause potential overflows should have "
31515            "already been handled.");
31516     int64_t SignMulAmt = C->getSExtValue();
31517     if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31518         (SignMulAmt != -INT64_MAX)) {
31519       int NumSign = SignMulAmt > 0 ? 1 : -1;
31520       bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31521       bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31522       if (IsPowerOf2_64PlusOne) {
31523         // (mul x, 2^N + 1) => (add (shl x, N), x)
31524         NewMul = DAG.getNode(
31525             ISD::ADD, DL, VT, N->getOperand(0),
31526             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31527                         DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31528                                         MVT::i8)));
31529       } else if (IsPowerOf2_64MinusOne) {
31530         // (mul x, 2^N - 1) => (sub (shl x, N), x)
31531         NewMul = DAG.getNode(
31532             ISD::SUB, DL, VT,
31533             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31534                         DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31535                                         MVT::i8)),
31536             N->getOperand(0));
31537       }
31538       // To negate, subtract the number from zero
31539       if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
31540         NewMul =
31541             DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
31542     }
31543   }
31544
31545   if (NewMul)
31546     // Do not add new nodes to DAG combiner worklist.
31547     DCI.CombineTo(N, NewMul, false);
31548
31549   return SDValue();
31550 }
31551
31552 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31553   SDValue N0 = N->getOperand(0);
31554   SDValue N1 = N->getOperand(1);
31555   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31556   EVT VT = N0.getValueType();
31557
31558   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31559   // since the result of setcc_c is all zero's or all ones.
31560   if (VT.isInteger() && !VT.isVector() &&
31561       N1C && N0.getOpcode() == ISD::AND &&
31562       N0.getOperand(1).getOpcode() == ISD::Constant) {
31563     SDValue N00 = N0.getOperand(0);
31564     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31565     Mask <<= N1C->getAPIntValue();
31566     bool MaskOK = false;
31567     // We can handle cases concerning bit-widening nodes containing setcc_c if
31568     // we carefully interrogate the mask to make sure we are semantics
31569     // preserving.
31570     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31571     // of the underlying setcc_c operation if the setcc_c was zero extended.
31572     // Consider the following example:
31573     //   zext(setcc_c)                 -> i32 0x0000FFFF
31574     //   c1                            -> i32 0x0000FFFF
31575     //   c2                            -> i32 0x00000001
31576     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31577     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
31578     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
31579       MaskOK = true;
31580     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
31581                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31582       MaskOK = true;
31583     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31584                 N00.getOpcode() == ISD::ANY_EXTEND) &&
31585                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31586       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31587     }
31588     if (MaskOK && Mask != 0) {
31589       SDLoc DL(N);
31590       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31591     }
31592   }
31593
31594   // Hardware support for vector shifts is sparse which makes us scalarize the
31595   // vector operations in many cases. Also, on sandybridge ADD is faster than
31596   // shl.
31597   // (shl V, 1) -> add V,V
31598   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31599     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31600       assert(N0.getValueType().isVector() && "Invalid vector shift type");
31601       // We shift all of the values by one. In many cases we do not have
31602       // hardware support for this operation. This is better expressed as an ADD
31603       // of two values.
31604       if (N1SplatC->getAPIntValue() == 1)
31605         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31606     }
31607
31608   return SDValue();
31609 }
31610
31611 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31612   SDValue N0 = N->getOperand(0);
31613   SDValue N1 = N->getOperand(1);
31614   EVT VT = N0.getValueType();
31615   unsigned Size = VT.getSizeInBits();
31616
31617   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31618   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31619   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31620   // depending on sign of (SarConst - [56,48,32,24,16])
31621
31622   // sexts in X86 are MOVs. The MOVs have the same code size
31623   // as above SHIFTs (only SHIFT on 1 has lower code size).
31624   // However the MOVs have 2 advantages to a SHIFT:
31625   // 1. MOVs can write to a register that differs from source
31626   // 2. MOVs accept memory operands
31627
31628   if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31629       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31630       N0.getOperand(1).getOpcode() != ISD::Constant)
31631     return SDValue();
31632
31633   SDValue N00 = N0.getOperand(0);
31634   SDValue N01 = N0.getOperand(1);
31635   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31636   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31637   EVT CVT = N1.getValueType();
31638
31639   if (SarConst.isNegative())
31640     return SDValue();
31641
31642   for (MVT SVT : MVT::integer_valuetypes()) {
31643     unsigned ShiftSize = SVT.getSizeInBits();
31644     // skipping types without corresponding sext/zext and
31645     // ShlConst that is not one of [56,48,32,24,16]
31646     if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31647       continue;
31648     SDLoc DL(N);
31649     SDValue NN =
31650         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31651     SarConst = SarConst - (Size - ShiftSize);
31652     if (SarConst == 0)
31653       return NN;
31654     else if (SarConst.isNegative())
31655       return DAG.getNode(ISD::SHL, DL, VT, NN,
31656                          DAG.getConstant(-SarConst, DL, CVT));
31657     else
31658       return DAG.getNode(ISD::SRA, DL, VT, NN,
31659                          DAG.getConstant(SarConst, DL, CVT));
31660   }
31661   return SDValue();
31662 }
31663
31664 /// \brief Returns a vector of 0s if the node in input is a vector logical
31665 /// shift by a constant amount which is known to be bigger than or equal
31666 /// to the vector element size in bits.
31667 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31668                                       const X86Subtarget &Subtarget) {
31669   EVT VT = N->getValueType(0);
31670
31671   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31672       (!Subtarget.hasInt256() ||
31673        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31674     return SDValue();
31675
31676   SDValue Amt = N->getOperand(1);
31677   SDLoc DL(N);
31678   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31679     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31680       const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31681       unsigned MaxAmount =
31682         VT.getSimpleVT().getScalarSizeInBits();
31683
31684       // SSE2/AVX2 logical shifts always return a vector of 0s
31685       // if the shift amount is bigger than or equal to
31686       // the element size. The constant shift amount will be
31687       // encoded as a 8-bit immediate.
31688       if (ShiftAmt.trunc(8).uge(MaxAmount))
31689         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31690     }
31691
31692   return SDValue();
31693 }
31694
31695 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31696                             TargetLowering::DAGCombinerInfo &DCI,
31697                             const X86Subtarget &Subtarget) {
31698   if (N->getOpcode() == ISD::SHL)
31699     if (SDValue V = combineShiftLeft(N, DAG))
31700       return V;
31701
31702   if (N->getOpcode() == ISD::SRA)
31703     if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31704       return V;
31705
31706   // Try to fold this logical shift into a zero vector.
31707   if (N->getOpcode() != ISD::SRA)
31708     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31709       return V;
31710
31711   return SDValue();
31712 }
31713
31714 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31715                                      TargetLowering::DAGCombinerInfo &DCI,
31716                                      const X86Subtarget &Subtarget) {
31717   unsigned Opcode = N->getOpcode();
31718   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31719           X86ISD::VSRLI == Opcode) &&
31720          "Unexpected shift opcode");
31721   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31722   EVT VT = N->getValueType(0);
31723   SDValue N0 = N->getOperand(0);
31724   SDValue N1 = N->getOperand(1);
31725   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31726   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31727          "Unexpected value type");
31728
31729   // Out of range logical bit shifts are guaranteed to be zero.
31730   // Out of range arithmetic bit shifts splat the sign bit.
31731   APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31732   if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31733     if (LogicalShift)
31734       return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31735     else
31736       ShiftVal = NumBitsPerElt - 1;
31737   }
31738
31739   // Shift N0 by zero -> N0.
31740   if (!ShiftVal)
31741     return N0;
31742
31743   // Shift zero -> zero.
31744   if (ISD::isBuildVectorAllZeros(N0.getNode()))
31745     return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31746
31747   // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31748   // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31749   // TODO - support other sra opcodes as needed.
31750   if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31751       N0.getOpcode() == X86ISD::VSRAI)
31752     return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31753
31754   // We can decode 'whole byte' logical bit shifts as shuffles.
31755   if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31756     SDValue Op(N, 0);
31757     SmallVector<int, 1> NonceMask; // Just a placeholder.
31758     NonceMask.push_back(0);
31759     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31760                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31761                                       DCI, Subtarget))
31762       return SDValue(); // This routine will use CombineTo to replace N.
31763   }
31764
31765   // Constant Folding.
31766   APInt UndefElts;
31767   SmallVector<APInt, 32> EltBits;
31768   if (N->isOnlyUserOf(N0.getNode()) &&
31769       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31770     assert(EltBits.size() == VT.getVectorNumElements() &&
31771            "Unexpected shift value type");
31772     unsigned ShiftImm = ShiftVal.getZExtValue();
31773     for (APInt &Elt : EltBits) {
31774       if (X86ISD::VSHLI == Opcode)
31775         Elt <<= ShiftImm;
31776       else if (X86ISD::VSRAI == Opcode)
31777         Elt.ashrInPlace(ShiftImm);
31778       else
31779         Elt.lshrInPlace(ShiftImm);
31780     }
31781     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31782   }
31783
31784   return SDValue();
31785 }
31786
31787 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31788                                    TargetLowering::DAGCombinerInfo &DCI,
31789                                    const X86Subtarget &Subtarget) {
31790   assert(
31791       ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31792        (N->getOpcode() == X86ISD::PINSRW &&
31793         N->getValueType(0) == MVT::v8i16)) &&
31794       "Unexpected vector insertion");
31795
31796   // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31797   SDValue Op(N, 0);
31798   SmallVector<int, 1> NonceMask; // Just a placeholder.
31799   NonceMask.push_back(0);
31800   combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31801                                 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31802                                 DCI, Subtarget);
31803   return SDValue();
31804 }
31805
31806 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31807 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31808 /// OR -> CMPNEQSS.
31809 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31810                                    TargetLowering::DAGCombinerInfo &DCI,
31811                                    const X86Subtarget &Subtarget) {
31812   unsigned opcode;
31813
31814   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31815   // we're requiring SSE2 for both.
31816   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31817     SDValue N0 = N->getOperand(0);
31818     SDValue N1 = N->getOperand(1);
31819     SDValue CMP0 = N0->getOperand(1);
31820     SDValue CMP1 = N1->getOperand(1);
31821     SDLoc DL(N);
31822
31823     // The SETCCs should both refer to the same CMP.
31824     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31825       return SDValue();
31826
31827     SDValue CMP00 = CMP0->getOperand(0);
31828     SDValue CMP01 = CMP0->getOperand(1);
31829     EVT     VT    = CMP00.getValueType();
31830
31831     if (VT == MVT::f32 || VT == MVT::f64) {
31832       bool ExpectingFlags = false;
31833       // Check for any users that want flags:
31834       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31835            !ExpectingFlags && UI != UE; ++UI)
31836         switch (UI->getOpcode()) {
31837         default:
31838         case ISD::BR_CC:
31839         case ISD::BRCOND:
31840         case ISD::SELECT:
31841           ExpectingFlags = true;
31842           break;
31843         case ISD::CopyToReg:
31844         case ISD::SIGN_EXTEND:
31845         case ISD::ZERO_EXTEND:
31846         case ISD::ANY_EXTEND:
31847           break;
31848         }
31849
31850       if (!ExpectingFlags) {
31851         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31852         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31853
31854         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31855           X86::CondCode tmp = cc0;
31856           cc0 = cc1;
31857           cc1 = tmp;
31858         }
31859
31860         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
31861             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31862           // FIXME: need symbolic constants for these magic numbers.
31863           // See X86ATTInstPrinter.cpp:printSSECC().
31864           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31865           if (Subtarget.hasAVX512()) {
31866             SDValue FSetCC =
31867                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31868                             DAG.getConstant(x86cc, DL, MVT::i8));
31869             return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31870                                FSetCC, DAG.getIntPtrConstant(0, DL));
31871           }
31872           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31873                                               CMP00.getValueType(), CMP00, CMP01,
31874                                               DAG.getConstant(x86cc, DL,
31875                                                               MVT::i8));
31876
31877           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31878           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31879
31880           if (is64BitFP && !Subtarget.is64Bit()) {
31881             // On a 32-bit target, we cannot bitcast the 64-bit float to a
31882             // 64-bit integer, since that's not a legal type. Since
31883             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
31884             // bits, but can do this little dance to extract the lowest 32 bits
31885             // and work with those going forward.
31886             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31887                                            OnesOrZeroesF);
31888             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31889             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31890                                         Vector32, DAG.getIntPtrConstant(0, DL));
31891             IntVT = MVT::i32;
31892           }
31893
31894           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31895           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31896                                       DAG.getConstant(1, DL, IntVT));
31897           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31898                                               ANDed);
31899           return OneBitOfTruth;
31900         }
31901       }
31902     }
31903   }
31904   return SDValue();
31905 }
31906
31907 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31908 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31909   assert(N->getOpcode() == ISD::AND);
31910
31911   EVT VT = N->getValueType(0);
31912   SDValue N0 = N->getOperand(0);
31913   SDValue N1 = N->getOperand(1);
31914   SDLoc DL(N);
31915
31916   if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31917     return SDValue();
31918
31919   if (N0.getOpcode() == ISD::XOR &&
31920       ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31921     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31922
31923   if (N1.getOpcode() == ISD::XOR &&
31924       ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31925     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31926
31927   return SDValue();
31928 }
31929
31930 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31931 // register. In most cases we actually compare or select YMM-sized registers
31932 // and mixing the two types creates horrible code. This method optimizes
31933 // some of the transition sequences.
31934 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31935                                  TargetLowering::DAGCombinerInfo &DCI,
31936                                  const X86Subtarget &Subtarget) {
31937   EVT VT = N->getValueType(0);
31938   if (!VT.is256BitVector())
31939     return SDValue();
31940
31941   assert((N->getOpcode() == ISD::ANY_EXTEND ||
31942           N->getOpcode() == ISD::ZERO_EXTEND ||
31943           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31944
31945   SDValue Narrow = N->getOperand(0);
31946   EVT NarrowVT = Narrow->getValueType(0);
31947   if (!NarrowVT.is128BitVector())
31948     return SDValue();
31949
31950   if (Narrow->getOpcode() != ISD::XOR &&
31951       Narrow->getOpcode() != ISD::AND &&
31952       Narrow->getOpcode() != ISD::OR)
31953     return SDValue();
31954
31955   SDValue N0  = Narrow->getOperand(0);
31956   SDValue N1  = Narrow->getOperand(1);
31957   SDLoc DL(Narrow);
31958
31959   // The Left side has to be a trunc.
31960   if (N0.getOpcode() != ISD::TRUNCATE)
31961     return SDValue();
31962
31963   // The type of the truncated inputs.
31964   EVT WideVT = N0->getOperand(0)->getValueType(0);
31965   if (WideVT != VT)
31966     return SDValue();
31967
31968   // The right side has to be a 'trunc' or a constant vector.
31969   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31970   ConstantSDNode *RHSConstSplat = nullptr;
31971   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31972     RHSConstSplat = RHSBV->getConstantSplatNode();
31973   if (!RHSTrunc && !RHSConstSplat)
31974     return SDValue();
31975
31976   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31977
31978   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
31979     return SDValue();
31980
31981   // Set N0 and N1 to hold the inputs to the new wide operation.
31982   N0 = N0->getOperand(0);
31983   if (RHSConstSplat) {
31984     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31985                      SDValue(RHSConstSplat, 0));
31986     N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31987   } else if (RHSTrunc) {
31988     N1 = N1->getOperand(0);
31989   }
31990
31991   // Generate the wide operation.
31992   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
31993   unsigned Opcode = N->getOpcode();
31994   switch (Opcode) {
31995   case ISD::ANY_EXTEND:
31996     return Op;
31997   case ISD::ZERO_EXTEND: {
31998     unsigned InBits = NarrowVT.getScalarSizeInBits();
31999     APInt Mask = APInt::getAllOnesValue(InBits);
32000     Mask = Mask.zext(VT.getScalarSizeInBits());
32001     return DAG.getNode(ISD::AND, DL, VT,
32002                        Op, DAG.getConstant(Mask, DL, VT));
32003   }
32004   case ISD::SIGN_EXTEND:
32005     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
32006                        Op, DAG.getValueType(NarrowVT));
32007   default:
32008     llvm_unreachable("Unexpected opcode");
32009   }
32010 }
32011
32012 /// If both input operands of a logic op are being cast from floating point
32013 /// types, try to convert this into a floating point logic node to avoid
32014 /// unnecessary moves from SSE to integer registers.
32015 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
32016                                         const X86Subtarget &Subtarget) {
32017   unsigned FPOpcode = ISD::DELETED_NODE;
32018   if (N->getOpcode() == ISD::AND)
32019     FPOpcode = X86ISD::FAND;
32020   else if (N->getOpcode() == ISD::OR)
32021     FPOpcode = X86ISD::FOR;
32022   else if (N->getOpcode() == ISD::XOR)
32023     FPOpcode = X86ISD::FXOR;
32024
32025   assert(FPOpcode != ISD::DELETED_NODE &&
32026          "Unexpected input node for FP logic conversion");
32027
32028   EVT VT = N->getValueType(0);
32029   SDValue N0 = N->getOperand(0);
32030   SDValue N1 = N->getOperand(1);
32031   SDLoc DL(N);
32032   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32033       ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32034        (Subtarget.hasSSE2() && VT == MVT::i64))) {
32035     SDValue N00 = N0.getOperand(0);
32036     SDValue N10 = N1.getOperand(0);
32037     EVT N00Type = N00.getValueType();
32038     EVT N10Type = N10.getValueType();
32039     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32040       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
32041       return DAG.getBitcast(VT, FPLogic);
32042     }
32043   }
32044   return SDValue();
32045 }
32046
32047 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
32048 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
32049 /// with a shift-right to eliminate loading the vector constant mask value.
32050 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32051                                      const X86Subtarget &Subtarget) {
32052   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32053   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32054   EVT VT0 = Op0.getValueType();
32055   EVT VT1 = Op1.getValueType();
32056
32057   if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
32058     return SDValue();
32059
32060   APInt SplatVal;
32061   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
32062       !SplatVal.isMask())
32063     return SDValue();
32064
32065   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
32066     return SDValue();
32067
32068   unsigned EltBitWidth = VT0.getScalarSizeInBits();
32069   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
32070     return SDValue();
32071
32072   SDLoc DL(N);
32073   unsigned ShiftVal = SplatVal.countTrailingOnes();
32074   SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32075   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32076   return DAG.getBitcast(N->getValueType(0), Shift);
32077 }
32078
32079 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32080                           TargetLowering::DAGCombinerInfo &DCI,
32081                           const X86Subtarget &Subtarget) {
32082   if (DCI.isBeforeLegalizeOps())
32083     return SDValue();
32084
32085   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32086     return R;
32087
32088   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32089     return FPLogic;
32090
32091   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
32092     return R;
32093
32094   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
32095     return ShiftRight;
32096
32097   EVT VT = N->getValueType(0);
32098   SDValue N0 = N->getOperand(0);
32099   SDValue N1 = N->getOperand(1);
32100   SDLoc DL(N);
32101
32102   // Attempt to recursively combine a bitmask AND with shuffles.
32103   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
32104     SDValue Op(N, 0);
32105     SmallVector<int, 1> NonceMask; // Just a placeholder.
32106     NonceMask.push_back(0);
32107     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
32108                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
32109                                       DCI, Subtarget))
32110       return SDValue(); // This routine will use CombineTo to replace N.
32111   }
32112
32113   // Create BEXTR instructions
32114   // BEXTR is ((X >> imm) & (2**size-1))
32115   if (VT != MVT::i32 && VT != MVT::i64)
32116     return SDValue();
32117
32118   if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
32119     return SDValue();
32120   if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
32121     return SDValue();
32122
32123   ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
32124   ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32125   if (MaskNode && ShiftNode) {
32126     uint64_t Mask = MaskNode->getZExtValue();
32127     uint64_t Shift = ShiftNode->getZExtValue();
32128     if (isMask_64(Mask)) {
32129       uint64_t MaskSize = countPopulation(Mask);
32130       if (Shift + MaskSize <= VT.getSizeInBits())
32131         return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
32132                            DAG.getConstant(Shift | (MaskSize << 8), DL,
32133                                            VT));
32134     }
32135   }
32136   return SDValue();
32137 }
32138
32139 // Try to fold:
32140 //   (or (and (m, y), (pandn m, x)))
32141 // into:
32142 //   (vselect m, x, y)
32143 // As a special case, try to fold:
32144 //   (or (and (m, (sub 0, x)), (pandn m, x)))
32145 // into:
32146 //   (sub (xor X, M), M)
32147 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
32148                                             const X86Subtarget &Subtarget) {
32149   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32150
32151   SDValue N0 = N->getOperand(0);
32152   SDValue N1 = N->getOperand(1);
32153   EVT VT = N->getValueType(0);
32154
32155   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
32156         (VT.is256BitVector() && Subtarget.hasInt256())))
32157     return SDValue();
32158
32159   // Canonicalize AND to LHS.
32160   if (N1.getOpcode() == ISD::AND)
32161     std::swap(N0, N1);
32162
32163   // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
32164   // ANDNP combine allows other combines to happen that prevent matching.
32165   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
32166     return SDValue();
32167
32168   SDValue Mask = N1.getOperand(0);
32169   SDValue X = N1.getOperand(1);
32170   SDValue Y;
32171   if (N0.getOperand(0) == Mask)
32172     Y = N0.getOperand(1);
32173   if (N0.getOperand(1) == Mask)
32174     Y = N0.getOperand(0);
32175
32176   // Check to see if the mask appeared in both the AND and ANDNP.
32177   if (!Y.getNode())
32178     return SDValue();
32179
32180   // Validate that X, Y, and Mask are bitcasts, and see through them.
32181   Mask = peekThroughBitcasts(Mask);
32182   X = peekThroughBitcasts(X);
32183   Y = peekThroughBitcasts(Y);
32184
32185   EVT MaskVT = Mask.getValueType();
32186   unsigned EltBits = MaskVT.getScalarSizeInBits();
32187
32188   // TODO: Attempt to handle floating point cases as well?
32189   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
32190     return SDValue();
32191
32192   SDLoc DL(N);
32193
32194   // Try to match:
32195   //   (or (and (M, (sub 0, X)), (pandn M, X)))
32196   // which is a special case of vselect:
32197   //   (vselect M, (sub 0, X), X)
32198   // Per:
32199   // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
32200   // We know that, if fNegate is 0 or 1:
32201   //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
32202   //
32203   // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
32204   //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
32205   //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
32206   // This lets us transform our vselect to:
32207   //   (add (xor X, M), (and M, 1))
32208   // And further to:
32209   //   (sub (xor X, M), M)
32210   if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
32211       DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
32212     auto IsNegV = [](SDNode *N, SDValue V) {
32213       return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
32214         ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
32215     };
32216     SDValue V;
32217     if (IsNegV(Y.getNode(), X))
32218       V = X;
32219     else if (IsNegV(X.getNode(), Y))
32220       V = Y;
32221
32222     if (V) {
32223       SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
32224       SDValue SubOp2 = Mask;
32225
32226       // If the negate was on the false side of the select, then
32227       // the operands of the SUB need to be swapped. PR 27251.
32228       // This is because the pattern being matched above is
32229       // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
32230       // but if the pattern matched was
32231       // (vselect M, X, (sub (0, X))), that is really negation of the pattern
32232       // above, -(vselect M, (sub 0, X), X), and therefore the replacement
32233       // pattern also needs to be a negation of the replacement pattern above.
32234       // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
32235       // sub accomplishes the negation of the replacement pattern.
32236       if (V == Y)
32237          std::swap(SubOp1, SubOp2);
32238
32239       SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
32240       return DAG.getBitcast(VT, Res);
32241     }
32242   }
32243
32244   // PBLENDVB is only available on SSE 4.1.
32245   if (!Subtarget.hasSSE41())
32246     return SDValue();
32247
32248   MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
32249
32250   X = DAG.getBitcast(BlendVT, X);
32251   Y = DAG.getBitcast(BlendVT, Y);
32252   Mask = DAG.getBitcast(BlendVT, Mask);
32253   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
32254   return DAG.getBitcast(VT, Mask);
32255 }
32256
32257 // Helper function for combineOrCmpEqZeroToCtlzSrl
32258 // Transforms:
32259 //   seteq(cmp x, 0)
32260 //   into:
32261 //   srl(ctlz x), log2(bitsize(x))
32262 // Input pattern is checked by caller.
32263 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
32264                                           SelectionDAG &DAG) {
32265   SDValue Cmp = Op.getOperand(1);
32266   EVT VT = Cmp.getOperand(0).getValueType();
32267   unsigned Log2b = Log2_32(VT.getSizeInBits());
32268   SDLoc dl(Op);
32269   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
32270   // The result of the shift is true or false, and on X86, the 32-bit
32271   // encoding of shr and lzcnt is more desirable.
32272   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
32273   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
32274                             DAG.getConstant(Log2b, dl, VT));
32275   return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
32276 }
32277
32278 // Try to transform:
32279 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
32280 //   into:
32281 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
32282 // Will also attempt to match more generic cases, eg:
32283 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
32284 // Only applies if the target supports the FastLZCNT feature.
32285 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
32286                                            TargetLowering::DAGCombinerInfo &DCI,
32287                                            const X86Subtarget &Subtarget) {
32288   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
32289     return SDValue();
32290
32291   auto isORCandidate = [](SDValue N) {
32292     return (N->getOpcode() == ISD::OR && N->hasOneUse());
32293   };
32294
32295   // Check the zero extend is extending to 32-bit or more. The code generated by
32296   // srl(ctlz) for 16-bit or less variants of the pattern would require extra
32297   // instructions to clear the upper bits.
32298   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
32299       !isORCandidate(N->getOperand(0)))
32300     return SDValue();
32301
32302   // Check the node matches: setcc(eq, cmp 0)
32303   auto isSetCCCandidate = [](SDValue N) {
32304     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
32305            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
32306            N->getOperand(1).getOpcode() == X86ISD::CMP &&
32307            isNullConstant(N->getOperand(1).getOperand(1)) &&
32308            N->getOperand(1).getValueType().bitsGE(MVT::i32);
32309   };
32310
32311   SDNode *OR = N->getOperand(0).getNode();
32312   SDValue LHS = OR->getOperand(0);
32313   SDValue RHS = OR->getOperand(1);
32314
32315   // Save nodes matching or(or, setcc(eq, cmp 0)).
32316   SmallVector<SDNode *, 2> ORNodes;
32317   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
32318           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
32319     ORNodes.push_back(OR);
32320     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
32321     LHS = OR->getOperand(0);
32322     RHS = OR->getOperand(1);
32323   }
32324
32325   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
32326   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
32327       !isORCandidate(SDValue(OR, 0)))
32328     return SDValue();
32329
32330   // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
32331   // to
32332   // or(srl(ctlz),srl(ctlz)).
32333   // The dag combiner can then fold it into:
32334   // srl(or(ctlz, ctlz)).
32335   EVT VT = OR->getValueType(0);
32336   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
32337   SDValue Ret, NewRHS;
32338   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
32339     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
32340
32341   if (!Ret)
32342     return SDValue();
32343
32344   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
32345   while (ORNodes.size() > 0) {
32346     OR = ORNodes.pop_back_val();
32347     LHS = OR->getOperand(0);
32348     RHS = OR->getOperand(1);
32349     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
32350     if (RHS->getOpcode() == ISD::OR)
32351       std::swap(LHS, RHS);
32352     EVT VT = OR->getValueType(0);
32353     SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
32354     if (!NewRHS)
32355       return SDValue();
32356     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
32357   }
32358
32359   if (Ret)
32360     Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
32361
32362   return Ret;
32363 }
32364
32365 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
32366                          TargetLowering::DAGCombinerInfo &DCI,
32367                          const X86Subtarget &Subtarget) {
32368   if (DCI.isBeforeLegalizeOps())
32369     return SDValue();
32370
32371   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32372     return R;
32373
32374   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32375     return FPLogic;
32376
32377   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
32378     return R;
32379
32380   SDValue N0 = N->getOperand(0);
32381   SDValue N1 = N->getOperand(1);
32382   EVT VT = N->getValueType(0);
32383
32384   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
32385     return SDValue();
32386
32387   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
32388   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
32389
32390   // SHLD/SHRD instructions have lower register pressure, but on some
32391   // platforms they have higher latency than the equivalent
32392   // series of shifts/or that would otherwise be generated.
32393   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
32394   // have higher latencies and we are not optimizing for size.
32395   if (!OptForSize && Subtarget.isSHLDSlow())
32396     return SDValue();
32397
32398   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
32399     std::swap(N0, N1);
32400   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
32401     return SDValue();
32402   if (!N0.hasOneUse() || !N1.hasOneUse())
32403     return SDValue();
32404
32405   SDValue ShAmt0 = N0.getOperand(1);
32406   if (ShAmt0.getValueType() != MVT::i8)
32407     return SDValue();
32408   SDValue ShAmt1 = N1.getOperand(1);
32409   if (ShAmt1.getValueType() != MVT::i8)
32410     return SDValue();
32411   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
32412     ShAmt0 = ShAmt0.getOperand(0);
32413   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
32414     ShAmt1 = ShAmt1.getOperand(0);
32415
32416   SDLoc DL(N);
32417   unsigned Opc = X86ISD::SHLD;
32418   SDValue Op0 = N0.getOperand(0);
32419   SDValue Op1 = N1.getOperand(0);
32420   if (ShAmt0.getOpcode() == ISD::SUB ||
32421       ShAmt0.getOpcode() == ISD::XOR) {
32422     Opc = X86ISD::SHRD;
32423     std::swap(Op0, Op1);
32424     std::swap(ShAmt0, ShAmt1);
32425   }
32426
32427   // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
32428   // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
32429   // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
32430   // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
32431   unsigned Bits = VT.getSizeInBits();
32432   if (ShAmt1.getOpcode() == ISD::SUB) {
32433     SDValue Sum = ShAmt1.getOperand(0);
32434     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
32435       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
32436       if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32437         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32438       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
32439         return DAG.getNode(Opc, DL, VT,
32440                            Op0, Op1,
32441                            DAG.getNode(ISD::TRUNCATE, DL,
32442                                        MVT::i8, ShAmt0));
32443     }
32444   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32445     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32446     if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32447       return DAG.getNode(Opc, DL, VT,
32448                          N0.getOperand(0), N1.getOperand(0),
32449                          DAG.getNode(ISD::TRUNCATE, DL,
32450                                        MVT::i8, ShAmt0));
32451   } else if (ShAmt1.getOpcode() == ISD::XOR) {
32452     SDValue Mask = ShAmt1.getOperand(1);
32453     if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32454       unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32455       SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32456       if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32457         ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32458       if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32459         if (Op1.getOpcode() == InnerShift &&
32460             isa<ConstantSDNode>(Op1.getOperand(1)) &&
32461             Op1.getConstantOperandVal(1) == 1) {
32462           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32463                              DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32464         }
32465         // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32466         if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32467             Op1.getOperand(0) == Op1.getOperand(1)) {
32468           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32469                      DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32470         }
32471       }
32472     }
32473   }
32474
32475   return SDValue();
32476 }
32477
32478 /// Generate NEG and CMOV for integer abs.
32479 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32480   EVT VT = N->getValueType(0);
32481
32482   // Since X86 does not have CMOV for 8-bit integer, we don't convert
32483   // 8-bit integer abs to NEG and CMOV.
32484   if (VT.isInteger() && VT.getSizeInBits() == 8)
32485     return SDValue();
32486
32487   SDValue N0 = N->getOperand(0);
32488   SDValue N1 = N->getOperand(1);
32489   SDLoc DL(N);
32490
32491   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32492   // and change it to SUB and CMOV.
32493   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32494       N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32495       N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32496     auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32497     if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32498       // Generate SUB & CMOV.
32499       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32500                                 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32501       SDValue Ops[] = {N0.getOperand(0), Neg,
32502                        DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32503                        SDValue(Neg.getNode(), 1)};
32504       return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32505     }
32506   }
32507   return SDValue();
32508 }
32509
32510 /// Try to turn tests against the signbit in the form of:
32511 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
32512 /// into:
32513 ///   SETGT(X, -1)
      ///
      /// \p N is the XOR node. The fold only fires when the result type is i8 or
      /// i1, the truncate and the shift each have a single use, and the shifted
      /// value is i16, i32 or i64.
      ///
      /// \returns the replacement comparison (zero-extended to the result type if
      /// the setcc result type differs), or an empty SDValue if nothing matched.
32514 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32515   // This is only worth doing if the output type is i8 or i1.
32516   EVT ResultType = N->getValueType(0);
32517   if (ResultType != MVT::i8 && ResultType != MVT::i1)
32518     return SDValue();
32519
32520   SDValue N0 = N->getOperand(0);
32521   SDValue N1 = N->getOperand(1);
32522
32523   // We should be performing an xor against a truncated shift.
32524   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32525     return SDValue();
32526
32527   // Make sure we are performing an xor against one.
32528   if (!isOneConstant(N1))
32529     return SDValue();
32530
32531   // SetCC on x86 zero extends so only act on this if it's a logical shift.
32532   SDValue Shift = N0.getOperand(0);
32533   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32534     return SDValue();
32535
32536   // Make sure we are truncating from one of i16, i32 or i64.
32537   EVT ShiftTy = Shift.getValueType();
32538   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32539     return SDValue();
32540
32541   // Make sure the shift amount extracts the sign bit.
32542   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32543       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32544     return SDValue();
32545
32546   // Create a greater-than comparison against -1.
32547   // N.B. Using SETGE against 0 works but we want a canonical looking
32548   // comparison, using SETGT matches up with what TranslateX86CC expects.
32549   SDLoc DL(N);
32550   SDValue ShiftOp = Shift.getOperand(0);
32551   EVT ShiftOpTy = ShiftOp.getValueType();
32552   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
        // The setcc result type is queried using the XOR's result type.
32553   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32554                                                *DAG.getContext(), ResultType);
32555   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32556                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
        // Bring the comparison back to the type the XOR produced.
32557   if (SetCCResultType != ResultType)
32558     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32559   return Cond;
32560 }
32561
32562 /// Turn vector tests of the signbit in the form of:
32563 ///   xor (sra X, elt_size(X)-1), -1
32564 /// into:
32565 ///   pcmpgt X, -1
32566 ///
32567 /// This should be called before type legalization because the pattern may not
32568 /// persist after that.
      ///
      /// \p N is the XOR node. \returns an X86ISD::PCMPGT node of the same type,
      /// or an empty SDValue if the type is unsupported on \p Subtarget or the
      /// pattern does not match.
32569 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32570                                          const X86Subtarget &Subtarget) {
32571   EVT VT = N->getValueType(0);
32572   if (!VT.isSimple())
32573     return SDValue();
32574
        // Each accepted vector type is gated on a subtarget feature: the 128-bit
        // i8/i16/i32 vectors on SSE2, v2i64 on SSE4.2, and all 256-bit integer
        // vectors on AVX2. Every other type is rejected.
32575   switch (VT.getSimpleVT().SimpleTy) {
32576   default: return SDValue();
32577   case MVT::v16i8:
32578   case MVT::v8i16:
32579   case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32580   case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32581   case MVT::v32i8:
32582   case MVT::v16i16:
32583   case MVT::v8i32:
32584   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32585   }
32586
32587   // There must be a shift right algebraic before the xor, and the xor must be a
32588   // 'not' operation.
        // The shift must also have no other users.
32589   SDValue Shift = N->getOperand(0);
32590   SDValue Ones = N->getOperand(1);
32591   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32592       !ISD::isBuildVectorAllOnes(Ones.getNode()))
32593     return SDValue();
32594
32595   // The shift should be smearing the sign bit across each vector element.
32596   auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32597   if (!ShiftBV)
32598     return SDValue();
32599
        // The shift amount must be a constant splat equal to the element width
        // minus one.
32600   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32601   auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32602   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32603     return SDValue();
32604
32605   // Create a greater-than comparison against -1. We don't use the more obvious
32606   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
        // The all-ones operand of the original XOR is reused as the -1 vector.
32607   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32608 }
32609
32610 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
32611 /// is valid for the given \p Subtarget.
      ///
      /// \returns true only if all of the following hold:
      ///   - the subtarget has AVX512;
      ///   - \p SrcVT is a simple vector type of at most 512 bits;
      ///   - source elements are 16 to 64 bits wide and destination elements are
      ///     8 to 32 bits wide;
      ///   - \p SrcVT is 512 bits wide or the subtarget has VLX;
      ///   - source elements are at least 32 bits wide or the subtarget has BWI.
32612 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32613                                         const X86Subtarget &Subtarget) {
32614   if (!Subtarget.hasAVX512())
32615     return false;
32616
32617   // FIXME: Scalar type may be supported if we move it to vector register.
32618   if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32619     return false;
32620
32621   EVT SrcElVT = SrcVT.getScalarType();
32622   EVT DstElVT = DstVT.getScalarType();
32623   if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32624     return false;
32625   if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32626     return false;
        // Sources narrower than 512 bits need VLX; 16-bit source elements
        // additionally need BWI.
32627   if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32628     return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32629   return false;
32630 }
32631
32632 /// Detect a pattern of truncation with saturation:
32633 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32634 /// Return the source value to be truncated or SDValue() if the pattern was not
32635 /// matched.
      ///
      /// \p In is the value being truncated and \p VT is the truncated type. Only
      /// operand 1 of the UMIN is inspected for the constant, and it must be a
      /// constant splat.
32636 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32637   if (In.getOpcode() != ISD::UMIN)
32638     return SDValue();
32639
32640   // Saturation with truncation. We truncate from InVT to VT.
32641   assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32642     "Unexpected types for truncate operation");
32643
32644   APInt C;
32645   if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32646     // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32647     // the element size of the destination type.
          // That is, C must be a mask of exactly VT's scalar width in low bits.
32648     return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32649       SDValue();
32650   }
32651   return SDValue();
32652 }
32653
32654 /// Detect a pattern of truncation with saturation:
32655 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32656 /// The types should allow to use VPMOVUS* instruction on AVX512.
32657 /// Return the source value to be truncated or SDValue() if the pattern was not
32658 /// matched.
      ///
      /// This is detectUSatPattern guarded by isSATValidOnAVX512Subtarget, which is
      /// evaluated first on the type of \p In and on \p VT.
32659 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32660                                        const X86Subtarget &Subtarget) {
32661   if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32662     return SDValue();
32663   return detectUSatPattern(In, VT);
32664 }
32665
      /// Replace a truncate of an unsigned-saturating UMIN with X86ISD::VTRUNCUS.
      ///
      /// \p In is the value being truncated and \p VT is the truncated type. Both
      /// the type of \p In and \p VT must be legal, \p In must match
      /// detectUSatPattern, and the type pair must pass
      /// isSATValidOnAVX512Subtarget.
      ///
      /// \returns the VTRUNCUS node applied to the UMIN's first operand, or an
      /// empty SDValue if any check fails.
32666 static SDValue
32667 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32668                         const X86Subtarget &Subtarget) {
32669   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32670   if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32671     return SDValue();
32672   if (auto USatVal = detectUSatPattern(In, VT))
32673     if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32674       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32675   return SDValue();
32676 }
32677
32678 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32679 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
32680 /// X86ISD::AVG instruction.
      ///
      /// \p In is the wide value that is about to be truncated and \p VT is the
      /// narrow result type (a simple vector of i8 or i16 with a power-of-two
      /// element count).
      ///
      /// \returns an X86ISD::AVG node of type \p VT, or an empty SDValue if the
      /// pattern or the subtarget requirements are not met.
32681 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32682                                 const X86Subtarget &Subtarget,
32683                                 const SDLoc &DL) {
32684   if (!VT.isVector() || !VT.isSimple())
32685     return SDValue();
32686   EVT InVT = In.getValueType();
32687   unsigned NumElems = VT.getVectorNumElements();
32688
32689   EVT ScalarVT = VT.getVectorElementType();
32690   if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32691         isPowerOf2_32(NumElems)))
32692     return SDValue();
32693
32694   // InScalarVT is the intermediate type in AVG pattern and it should be greater
32695   // than the original input type (i8/i16).
32696   EVT InScalarVT = InVT.getVectorElementType();
32697   if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32698     return SDValue();
32699
        // SSE2 is the minimum requirement. The widest accepted result is 512 bits
        // with BWI, 256 bits with AVX2, and 128 bits otherwise.
32700   if (!Subtarget.hasSSE2())
32701     return SDValue();
32702   if (Subtarget.hasBWI()) {
32703     if (VT.getSizeInBits() > 512)
32704       return SDValue();
32705   } else if (Subtarget.hasAVX2()) {
32706     if (VT.getSizeInBits() > 256)
32707       return SDValue();
32708   } else {
32709     if (VT.getSizeInBits() > 128)
32710       return SDValue();
32711   }
32712
32713   // Detect the following pattern:
32714   //
32715   //   %1 = zext <N x i8> %a to <N x i32>
32716   //   %2 = zext <N x i8> %b to <N x i32>
32717   //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32718   //   %4 = add nuw nsw <N x i32> %3, %2
32719   //   %5 = lshr <N x i32> %4, <i32 1 x N>
32720   //   %6 = trunc <N x i32> %5 to <N x i8>
32721   //
32722   // In AVX512, the last instruction can also be a trunc store.
32723
32724   if (In.getOpcode() != ISD::SRL)
32725     return SDValue();
32726
32727   // A lambda checking the given SDValue is a constant vector and each element
32728   // is in the range [Min, Max].
        // Any operand that is not a ConstantSDNode makes the check fail.
32729   auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32730     BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32731     if (!BV || !BV->isConstant())
32732       return false;
32733     for (SDValue Op : V->ops()) {
32734       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32735       if (!C)
32736         return false;
32737       uint64_t Val = C->getZExtValue();
32738       if (Val < Min || Val > Max)
32739         return false;
32740     }
32741     return true;
32742   };
32743
32744   // Check if each element of the vector is logically right-shifted by one.
32745   auto LHS = In.getOperand(0);
32746   auto RHS = In.getOperand(1);
32747   if (!IsConstVectorInRange(RHS, 1, 1))
32748     return SDValue();
32749   if (LHS.getOpcode() != ISD::ADD)
32750     return SDValue();
32751
32752   // Detect a pattern of a + b + 1 where the order doesn't matter.
32753   SDValue Operands[3];
32754   Operands[0] = LHS.getOperand(0);
32755   Operands[1] = LHS.getOperand(1);
32756
32757   // Take care of the case when one of the operands is a constant vector whose
32758   // element is in the range [1, 256] for i8, or [1, 65536] for i16.
        // Only the form with the constant as operand 1 of the ADD is handled here.
32759   if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32760       Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32761       Operands[0].getOperand(0).getValueType() == VT) {
32762     // The pattern is detected. Subtract one from the constant vector, then
32763     // demote it and emit X86ISD::AVG instruction.
32764     SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32765     Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32766     Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32767     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32768                        Operands[1]);
32769   }
32770
        // Otherwise one operand of the outer ADD must itself be an ADD. Move it to
        // Operands[1] and split it, so that Operands[0..2] hold the three addends.
32771   if (Operands[0].getOpcode() == ISD::ADD)
32772     std::swap(Operands[0], Operands[1]);
32773   else if (Operands[1].getOpcode() != ISD::ADD)
32774     return SDValue();
32775   Operands[2] = Operands[1].getOperand(0);
32776   Operands[1] = Operands[1].getOperand(1);
32777
32778   // Now we have three operands of two additions. Check that one of them is a
32779   // constant vector with ones, and the other two are promoted from i8/i16.
32780   for (int i = 0; i < 3; ++i) {
32781     if (!IsConstVectorInRange(Operands[i], 1, 1))
32782       continue;
          // Park the all-ones vector in slot 2 so that slots 0 and 1 hold the two
          // remaining addends. The function returns on the first all-ones vector
          // found, whether or not the other two addends qualify.
32783     std::swap(Operands[i], Operands[2]);
32784
32785     // Check if Operands[0] and Operands[1] are results of type promotion.
32786     for (int j = 0; j < 2; ++j)
32787       if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32788           Operands[j].getOperand(0).getValueType() != VT)
32789         return SDValue();
32790
32791     // The pattern is detected, emit X86ISD::AVG instruction.
32792     return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32793                        Operands[1].getOperand(0));
32794   }
32795
32796   return SDValue();
32797 }
32798
32799 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32800                            TargetLowering::DAGCombinerInfo &DCI,
32801                            const X86Subtarget &Subtarget) {
32802   LoadSDNode *Ld = cast<LoadSDNode>(N);
32803   EVT RegVT = Ld->getValueType(0);
32804   EVT MemVT = Ld->getMemoryVT();
32805   SDLoc dl(Ld);
32806   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32807
32808   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32809   // into two 16-byte operations. Also split non-temporal aligned loads on
32810   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
32811   ISD::LoadExtType Ext = Ld->getExtensionType();
32812   bool Fast;
32813   unsigned AddressSpace = Ld->getAddressSpace();
32814   unsigned Alignment = Ld->getAlignment();
32815   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32816       Ext == ISD::NON_EXTLOAD &&
32817       ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32818        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32819                                AddressSpace, Alignment, &Fast) && !Fast))) {
32820     unsigned NumElems = RegVT.getVectorNumElements();
32821     if (NumElems < 2)
32822       return SDValue();
32823
32824     SDValue Ptr = Ld->getBasePtr();
32825
32826     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32827                                   NumElems/2);
32828     SDValue Load1 =
32829         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32830                     Alignment, Ld->getMemOperand()->getFlags());
32831
32832     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32833     SDValue Load2 =
32834         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32835                     std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32836     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32837                              Load1.getValue(1),
32838                              Load2.getValue(1));
32839
32840     SDValue NewVec = DAG.getUNDEF(RegVT);
32841     NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32842     NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32843     return DCI.CombineTo(N, NewVec, TF, true);
32844   }
32845
32846   return SDValue();
32847 }
32848
32849 /// If V is a build vector of boolean constants and exactly one of those
32850 /// constants is true, return the operand index of that true element.
32851 /// Otherwise, return -1.
      ///
      /// Undef operands are skipped. An operand that is neither undef nor a
      /// ConstantSDNode yields -1, as does a vector with no true element. An
      /// element counts as true only when its constant is all-ones.
32852 static int getOneTrueElt(SDValue V) {
32853   // This needs to be a build vector of booleans.
32854   // TODO: Checking for the i1 type matches the IR definition for the mask,
32855   // but the mask check could be loosened to i8 or other types. That might
32856   // also require checking more than 'allOnesValue'; eg, the x86 HW
32857   // instructions only require that the MSB is set for each mask element.
32858   // The ISD::MSTORE comments/definition do not specify how the mask operand
32859   // is formatted.
32860   auto *BV = dyn_cast<BuildVectorSDNode>(V);
32861   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32862     return -1;
32863
32864   int TrueIndex = -1;
32865   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32866   for (unsigned i = 0; i < NumElts; ++i) {
32867     const SDValue &Op = BV->getOperand(i);
32868     if (Op.isUndef())
32869       continue;
32870     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32871     if (!ConstNode)
32872       return -1;
32873     if (ConstNode->getAPIntValue().isAllOnesValue()) {
32874       // If we already found a one, this is too many.
32875       if (TrueIndex >= 0)
32876         return -1;
32877       TrueIndex = i;
32878     }
32879   }
32880   return TrueIndex;
32881 }
32882
32883 /// Given a masked memory load/store operation, return true if it has one mask
32884 /// bit set. If it has one mask bit set, then also return the memory address of
32885 /// the scalar element to load/store, the vector index to insert/extract that
32886 /// scalar element, and the alignment for the scalar memory access.
      ///
      /// \param [out] Addr      base pointer of \p MaskedOp, advanced by the byte
      ///                        offset of the selected element when that element
      ///                        is not element 0.
      /// \param [out] Index     pointer-sized constant holding the element index.
      /// \param [out] Alignment MinAlign of the operation's alignment and the
      ///                        element's store size.
      ///
      /// The out-parameters are left untouched when false is returned.
32887 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32888                                          SelectionDAG &DAG, SDValue &Addr,
32889                                          SDValue &Index, unsigned &Alignment) {
32890   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32891   if (TrueMaskElt < 0)
32892     return false;
32893
32894   // Get the address of the one scalar element that is specified by the mask
32895   // using the appropriate offset from the base pointer.
        // The offset is the element index times the store size of one element of
        // the memory type.
32896   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32897   Addr = MaskedOp->getBasePtr();
32898   if (TrueMaskElt != 0) {
32899     unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32900     Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32901   }
32902
32903   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32904   Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32905   return true;
32906 }
32907
32908 /// If exactly one element of the mask is set for a non-extending masked load,
32909 /// it is a scalar load and vector insert.
32910 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32911 /// mask have already been optimized in IR, so we don't bother with those here.
      ///
      /// \returns the result of DCI.CombineTo, which replaces \p ML with the
      /// insert and the scalar load's chain, or an empty SDValue when the mask
      /// does not have exactly one true element.
32912 static SDValue
32913 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32914                              TargetLowering::DAGCombinerInfo &DCI) {
32915   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32916   // However, some target hooks may need to be added to know when the transform
32917   // is profitable. Endianness would also have to be considered.
32918
32919   SDValue Addr, VecIndex;
32920   unsigned Alignment;
32921   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32922     return SDValue();
32923
32924   // Load the one scalar element that is specified by the mask using the
32925   // appropriate offset from the base pointer.
        // NOTE(review): Addr may have been advanced past the base pointer, yet the
        // pointer info passed below is ML's unmodified one. Confirm whether it
        // should carry the same byte offset.
32926   SDLoc DL(ML);
32927   EVT VT = ML->getValueType(0);
32928   EVT EltVT = VT.getVectorElementType();
32929   SDValue Load =
32930       DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32931                   Alignment, ML->getMemOperand()->getFlags());
32932
32933   // Insert the loaded element into the appropriate place in the vector.
        // The pass-through operand (Src0) supplies all the other lanes.
32934   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32935                                Load, VecIndex);
32936   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
32937 }
32938
      /// Simplify a masked load whose mask is a build vector of constants.
      ///
      /// Two rewrites are tried, in this order:
      ///   1. If neither the first nor the last mask operand is the constant
      ///      zero, the masked load becomes a plain vector load followed by a
      ///      select against the pass-through operand.
      ///   2. Otherwise, unless the pass-through operand is already undef, the
      ///      masked load is re-created with an undef pass-through and followed
      ///      by a select against the original pass-through.
      ///
      /// \returns the result of DCI.CombineTo, or an empty SDValue if the mask is
      /// not constant or no rewrite applies.
32939 static SDValue
32940 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32941                               TargetLowering::DAGCombinerInfo &DCI) {
32942   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32943     return SDValue();
32944
32945   SDLoc DL(ML);
32946   EVT VT = ML->getValueType(0);
32947
32948   // If we are loading the first and last elements of a vector, it is safe and
32949   // always faster to load the whole vector. Replace the masked load with a
32950   // vector load and select.
        // A lane counts as loaded whenever its mask operand is not the constant
        // zero.
32951   unsigned NumElts = VT.getVectorNumElements();
32952   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32953   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32954   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32955   if (LoadFirstElt && LoadLastElt) {
32956     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32957                                 ML->getMemOperand());
32958     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32959     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32960   }
32961
32962   // Convert a masked load with a constant mask into a masked load and a select.
32963   // This allows the select operation to use a faster kind of select instruction
32964   // (for example, vblendvps -> vblendps).
32965
32966   // Don't try this if the pass-through operand is already undefined. That would
32967   // cause an infinite loop because that's what we're about to create.
32968   if (ML->getSrc0().isUndef())
32969     return SDValue();
32970
32971   // The new masked load has an undef pass-through operand. The select uses the
32972   // original pass-through operand.
32973   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32974                                     ML->getMask(), DAG.getUNDEF(VT),
32975                                     ML->getMemoryVT(), ML->getMemOperand(),
32976                                     ML->getExtensionType());
32977   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32978
32979   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
32980 }
32981
      /// DAG combine for masked loads.
      ///
      /// Expanding loads are left alone. A non-extending load is first offered to
      /// reduceMaskedLoadToScalarLoad and then, on subtargets without AVX512, to
      /// combineMaskedLoadConstantMask. A sign-extending load is rewritten as a
      /// non-extending masked load of a vector with the memory element type and
      /// the same total width as the result, followed by an X86ISD::VSEXT
      /// in-vector extension. Every other extension kind is left alone.
      ///
      /// \returns the replacement value, or an empty SDValue if nothing changed.
32982 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32983                                  TargetLowering::DAGCombinerInfo &DCI,
32984                                  const X86Subtarget &Subtarget) {
32985   MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32986
32987   // TODO: Expanding load with constant mask may be optimized as well.
32988   if (Mld->isExpandingLoad())
32989     return SDValue();
32990
32991   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32992     if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32993       return ScalarLoad;
32994     // TODO: Do some AVX512 subsets benefit from this transform?
32995     if (!Subtarget.hasAVX512())
32996       if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
32997         return Blend;
32998   }
32999
        // Only sign-extending loads are handled from here on.
33000   if (Mld->getExtensionType() != ISD::SEXTLOAD)
33001     return SDValue();
33002
33003   // Resolve extending loads.
33004   EVT VT = Mld->getValueType(0);
33005   unsigned NumElems = VT.getVectorNumElements();
33006   EVT LdVT = Mld->getMemoryVT();
33007   SDLoc dl(Mld);
33008
33009   assert(LdVT != VT && "Cannot extend to the same type");
33010   unsigned ToSz = VT.getScalarSizeInBits();
33011   unsigned FromSz = LdVT.getScalarSizeInBits();
33012   // From/To sizes and ElemCount must be pow of two.
33013   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33014     "Unexpected size for extending masked load");
33015
33016   unsigned SizeRatio  = ToSz / FromSz;
33017   assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
33018
33019   // Create a type on which we perform the shuffle.
        // It has the memory element type and NumElems * SizeRatio elements, so it
        // is exactly as wide as VT.
33020   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33021           LdVT.getScalarType(), NumElems*SizeRatio);
33022   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33023
33024   // Convert Src0 value.
        // Sub-element i * SizeRatio of the bitcast pass-through is moved to
        // position i. The remaining positions stay undef.
33025   SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
33026   if (!Mld->getSrc0().isUndef()) {
33027     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33028     for (unsigned i = 0; i != NumElems; ++i)
33029       ShuffleVec[i] = i * SizeRatio;
33030
33031     // Can't shuffle using an illegal type.
33032     assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33033            "WideVecVT should be legal");
33034     WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
33035                                     DAG.getUNDEF(WideVecVT), ShuffleVec);
33036   }
33037   // Prepare the new mask.
33038   SDValue NewMask;
33039   SDValue Mask = Mld->getMask();
33040   if (Mask.getValueType() == VT) {
33041     // Mask and original value have the same type.
          // The mask is compacted the same way as the pass-through. Positions at
          // or beyond NumElems use index NumElems * SizeRatio, i.e. the first
          // element of the second shuffle input (the zero vector).
33042     NewMask = DAG.getBitcast(WideVecVT, Mask);
33043     SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33044     for (unsigned i = 0; i != NumElems; ++i)
33045       ShuffleVec[i] = i * SizeRatio;
33046     for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
33047       ShuffleVec[i] = NumElems * SizeRatio;
33048     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33049                                    DAG.getConstant(0, dl, WideVecVT),
33050                                    ShuffleVec);
33051   } else {
          // An i1-element mask is widened by concatenating it with zero vectors
          // of the original mask type until it has WidenNumElts elements.
33052     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33053     unsigned WidenNumElts = NumElems*SizeRatio;
33054     unsigned MaskNumElts = VT.getVectorNumElements();
33055     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
33056                                      WidenNumElts);
33057
33058     unsigned NumConcat = WidenNumElts / MaskNumElts;
33059     SmallVector<SDValue, 16> Ops(NumConcat);
33060     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33061     Ops[0] = Mask;
33062     for (unsigned i = 1; i != NumConcat; ++i)
33063       Ops[i] = ZeroVal;
33064
33065     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33066   }
33067
        // Emit the non-extending masked load on the wide type, then sign-extend
        // in-vector to the original result type.
33068   SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
33069                                      Mld->getBasePtr(), NewMask, WideSrc0,
33070                                      Mld->getMemoryVT(), Mld->getMemOperand(),
33071                                      ISD::NON_EXTLOAD);
33072   SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
33073   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
33074 }
33075
33076 /// If exactly one element of the mask is set for a non-truncating masked store,
33077 /// it is a vector extract and scalar store.
33078 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33079 /// mask have already been optimized in IR, so we don't bother with those here.
      ///
      /// \returns the scalar store node, or an empty SDValue when the mask does
      /// not have exactly one true element.
33080 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33081                                               SelectionDAG &DAG) {
33082   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33083   // However, some target hooks may need to be added to know when the transform
33084   // is profitable. Endianness would also have to be considered.
33085
33086   SDValue Addr, VecIndex;
33087   unsigned Alignment;
33088   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33089     return SDValue();
33090
33091   // Extract the one scalar element that is actually being stored.
33092   SDLoc DL(MS);
33093   EVT VT = MS->getValue().getValueType();
33094   EVT EltVT = VT.getVectorElementType();
33095   SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33096                                 MS->getValue(), VecIndex);
33097
33098   // Store that element at the appropriate offset from the base pointer.
        // NOTE(review): Addr may have been advanced past the base pointer, yet the
        // pointer info passed below is MS's unmodified one. Confirm whether it
        // should carry the same byte offset.
33099   return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33100                       Alignment, MS->getMemOperand()->getFlags());
33101 }
33102
      /// DAG combine for masked stores.
      ///
      /// Compressing stores are left alone. A non-truncating store is handed to
      /// reduceMaskedStoreToScalarStore. A truncating store that the target does
      /// not report as legal is rewritten: the value is bitcast to a vector of
      /// the memory element type, the kept sub-elements are shuffled to the front,
      /// the mask is rebuilt to match, and a non-truncating masked store is
      /// emitted.
      ///
      /// \returns the replacement node, or an empty SDValue if nothing changed.
33103 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
33104                                   const X86Subtarget &Subtarget) {
33105   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
33106
33107   if (Mst->isCompressingStore())
33108     return SDValue();
33109
33110   if (!Mst->isTruncatingStore())
33111     return reduceMaskedStoreToScalarStore(Mst, DAG);
33112
33113   // Resolve truncating stores.
33114   EVT VT = Mst->getValue().getValueType();
33115   unsigned NumElems = VT.getVectorNumElements();
33116   EVT StVT = Mst->getMemoryVT();
33117   SDLoc dl(Mst);
33118
33119   assert(StVT != VT && "Cannot truncate to the same type");
33120   unsigned FromSz = VT.getScalarSizeInBits();
33121   unsigned ToSz = StVT.getScalarSizeInBits();
33122
33123   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33124
33125   // The truncating store is legal in some cases. For example
33126   // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33127   // are designated for truncate store.
33128   // In this case we don't need any further transformations.
33129   if (TLI.isTruncStoreLegal(VT, StVT))
33130     return SDValue();
33131
33132   // From/To sizes and ElemCount must be pow of two.
33133   assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33134     "Unexpected size for truncating masked store");
33135   // We are going to use the original vector elt for storing.
33136   // Accumulated smaller vector elements must be a multiple of the store size.
33137   assert (((NumElems * FromSz) % ToSz) == 0 &&
33138           "Unexpected ratio for truncating masked store");
33139
33140   unsigned SizeRatio  = FromSz / ToSz;
33141   assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33142
33143   // Create a type on which we perform the shuffle.
        // It has the memory element type and NumElems * SizeRatio elements, so it
        // is exactly as wide as VT.
33144   EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33145           StVT.getScalarType(), NumElems*SizeRatio);
33146
33147   assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33148
        // Sub-element i * SizeRatio of the bitcast value is moved to position i.
        // The remaining positions stay undef.
33149   SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
33150   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33151   for (unsigned i = 0; i != NumElems; ++i)
33152     ShuffleVec[i] = i * SizeRatio;
33153
33154   // Can't shuffle using an illegal type.
33155   assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33156          "WideVecVT should be legal");
33157
33158   SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33159                                               DAG.getUNDEF(WideVecVT),
33160                                               ShuffleVec);
33161
33162   SDValue NewMask;
33163   SDValue Mask = Mst->getMask();
33164   if (Mask.getValueType() == VT) {
33165     // Mask and original value have the same type.
          // ShuffleVec is reused from above. Positions at or beyond NumElems are
          // pointed at index NumElems * SizeRatio, i.e. the first element of the
          // second shuffle input (the zero vector).
33166     NewMask = DAG.getBitcast(WideVecVT, Mask);
33167     for (unsigned i = 0; i != NumElems; ++i)
33168       ShuffleVec[i] = i * SizeRatio;
33169     for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
33170       ShuffleVec[i] = NumElems*SizeRatio;
33171     NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33172                                    DAG.getConstant(0, dl, WideVecVT),
33173                                    ShuffleVec);
33174   } else {
          // An i1-element mask is widened by concatenating it with zero vectors
          // of the original mask type until it has WidenNumElts elements.
33175     assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33176     unsigned WidenNumElts = NumElems*SizeRatio;
33177     unsigned MaskNumElts = VT.getVectorNumElements();
33178     EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
33179                                      WidenNumElts);
33180
33181     unsigned NumConcat = WidenNumElts / MaskNumElts;
33182     SmallVector<SDValue, 16> Ops(NumConcat);
33183     SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33184     Ops[0] = Mask;
33185     for (unsigned i = 1; i != NumConcat; ++i)
33186       Ops[i] = ZeroVal;
33187
33188     NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33189   }
33190
        // Emit the masked store on the compacted value; the last argument marks it
        // as non-truncating.
33191   return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
33192                             Mst->getBasePtr(), NewMask, StVT,
33193                             Mst->getMemOperand(), false);
33194 }
33195
33196 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
33197                             const X86Subtarget &Subtarget) {
33198   StoreSDNode *St = cast<StoreSDNode>(N);
33199   EVT VT = St->getValue().getValueType();
33200   EVT StVT = St->getMemoryVT();
33201   SDLoc dl(St);
33202   SDValue StoredVal = St->getOperand(1);
33203   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33204
33205   // If we are saving a concatenation of two XMM registers and 32-byte stores
33206   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
33207   bool Fast;
33208   unsigned AddressSpace = St->getAddressSpace();
33209   unsigned Alignment = St->getAlignment();
33210   if (VT.is256BitVector() && StVT == VT &&
33211       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
33212                              AddressSpace, Alignment, &Fast) &&
33213       !Fast) {
33214     unsigned NumElems = VT.getVectorNumElements();
33215     if (NumElems < 2)
33216       return SDValue();
33217
33218     SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
33219     SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
33220
33221     SDValue Ptr0 = St->getBasePtr();
33222     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
33223
33224     SDValue Ch0 =
33225         DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
33226                      Alignment, St->getMemOperand()->getFlags());
33227     SDValue Ch1 =
33228         DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
33229                      std::min(16U, Alignment), St->getMemOperand()->getFlags());
33230     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
33231   }
33232
33233   // Optimize trunc store (of multiple scalars) to shuffle and store.
33234   // First, pack all of the elements in one place. Next, store to memory
33235   // in fewer chunks.
33236   if (St->isTruncatingStore() && VT.isVector()) {
33237     // Check if we can detect an AVG pattern from the truncation. If yes,
33238     // replace the trunc store by a normal store with the result of X86ISD::AVG
33239     // instruction.
33240     if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
33241                                        Subtarget, dl))
33242       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
33243                           St->getPointerInfo(), St->getAlignment(),
33244                           St->getMemOperand()->getFlags());
33245
33246     if (SDValue Val =
33247         detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
33248       return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
33249                              dl, Val, St->getBasePtr(),
33250                              St->getMemoryVT(), St->getMemOperand(), DAG);
33251
33252     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33253     unsigned NumElems = VT.getVectorNumElements();
33254     assert(StVT != VT && "Cannot truncate to the same type");
33255     unsigned FromSz = VT.getScalarSizeInBits();
33256     unsigned ToSz = StVT.getScalarSizeInBits();
33257
33258     // The truncating store is legal in some cases. For example
33259     // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33260     // are designated for truncate store.
33261     // In this case we don't need any further transformations.
33262     if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
33263       return SDValue();
33264
33265     // From, To sizes and ElemCount must be pow of two
33266     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
33267     // We are going to use the original vector elt for storing.
33268     // Accumulated smaller vector elements must be a multiple of the store size.
33269     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
33270
33271     unsigned SizeRatio  = FromSz / ToSz;
33272
33273     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33274
33275     // Create a type on which we perform the shuffle
33276     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33277             StVT.getScalarType(), NumElems*SizeRatio);
33278
33279     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33280
33281     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
33282     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
33283     for (unsigned i = 0; i != NumElems; ++i)
33284       ShuffleVec[i] = i * SizeRatio;
33285
33286     // Can't shuffle using an illegal type.
33287     if (!TLI.isTypeLegal(WideVecVT))
33288       return SDValue();
33289
33290     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33291                                          DAG.getUNDEF(WideVecVT),
33292                                          ShuffleVec);
33293     // At this point all of the data is stored at the bottom of the
33294     // register. We now need to save it to mem.
33295
33296     // Find the largest store unit
33297     MVT StoreType = MVT::i8;
33298     for (MVT Tp : MVT::integer_valuetypes()) {
33299       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
33300         StoreType = Tp;
33301     }
33302
33303     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
33304     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
33305         (64 <= NumElems * ToSz))
33306       StoreType = MVT::f64;
33307
33308     // Bitcast the original vector into a vector of store-size units
33309     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
33310             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
33311     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33312     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
33313     SmallVector<SDValue, 8> Chains;
33314     SDValue Ptr = St->getBasePtr();
33315
33316     // Perform one or more big stores into memory.
33317     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
33318       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
33319                                    StoreType, ShuffWide,
33320                                    DAG.getIntPtrConstant(i, dl));
33321       SDValue Ch =
33322           DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
33323                        St->getAlignment(), St->getMemOperand()->getFlags());
33324       Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
33325       Chains.push_back(Ch);
33326     }
33327
33328     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
33329   }
33330
33331   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
33332   // the FP state in cases where an emms may be missing.
33333   // A preferable solution to the general problem is to figure out the right
33334   // places to insert EMMS.  This qualifies as a quick hack.
33335
33336   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
33337   if (VT.getSizeInBits() != 64)
33338     return SDValue();
33339
33340   const Function *F = DAG.getMachineFunction().getFunction();
33341   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
33342   bool F64IsLegal =
33343       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
33344   if ((VT.isVector() ||
33345        (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
33346       isa<LoadSDNode>(St->getValue()) &&
33347       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
33348       St->getChain().hasOneUse() && !St->isVolatile()) {
33349     SDNode* LdVal = St->getValue().getNode();
33350     LoadSDNode *Ld = nullptr;
33351     int TokenFactorIndex = -1;
33352     SmallVector<SDValue, 8> Ops;
33353     SDNode* ChainVal = St->getChain().getNode();
33354     // Must be a store of a load.  We currently handle two cases:  the load
33355     // is a direct child, and it's under an intervening TokenFactor.  It is
33356     // possible to dig deeper under nested TokenFactors.
33357     if (ChainVal == LdVal)
33358       Ld = cast<LoadSDNode>(St->getChain());
33359     else if (St->getValue().hasOneUse() &&
33360              ChainVal->getOpcode() == ISD::TokenFactor) {
33361       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
33362         if (ChainVal->getOperand(i).getNode() == LdVal) {
33363           TokenFactorIndex = i;
33364           Ld = cast<LoadSDNode>(St->getValue());
33365         } else
33366           Ops.push_back(ChainVal->getOperand(i));
33367       }
33368     }
33369
33370     if (!Ld || !ISD::isNormalLoad(Ld))
33371       return SDValue();
33372
33373     // If this is not the MMX case, i.e. we are just turning i64 load/store
33374     // into f64 load/store, avoid the transformation if there are multiple
33375     // uses of the loaded value.
33376     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
33377       return SDValue();
33378
33379     SDLoc LdDL(Ld);
33380     SDLoc StDL(N);
33381     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
33382     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
33383     // pair instead.
33384     if (Subtarget.is64Bit() || F64IsLegal) {
33385       MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
33386       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
33387                                   Ld->getPointerInfo(), Ld->getAlignment(),
33388                                   Ld->getMemOperand()->getFlags());
33389       // Make sure new load is placed in same chain order.
33390       SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
33391       if (TokenFactorIndex >= 0) {
33392         Ops.push_back(NewChain);
33393         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33394       }
33395       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
33396                           St->getPointerInfo(), St->getAlignment(),
33397                           St->getMemOperand()->getFlags());
33398     }
33399
33400     // Otherwise, lower to two pairs of 32-bit loads / stores.
33401     SDValue LoAddr = Ld->getBasePtr();
33402     SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
33403
33404     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
33405                                Ld->getPointerInfo(), Ld->getAlignment(),
33406                                Ld->getMemOperand()->getFlags());
33407     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
33408                                Ld->getPointerInfo().getWithOffset(4),
33409                                MinAlign(Ld->getAlignment(), 4),
33410                                Ld->getMemOperand()->getFlags());
33411     // Make sure new loads are placed in same chain order.
33412     SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
33413     NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
33414
33415     if (TokenFactorIndex >= 0) {
33416       Ops.push_back(NewChain);
33417       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33418     }
33419
33420     LoAddr = St->getBasePtr();
33421     HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
33422
33423     SDValue LoSt =
33424         DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
33425                      St->getAlignment(), St->getMemOperand()->getFlags());
33426     SDValue HiSt = DAG.getStore(
33427         NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
33428         MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
33429     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
33430   }
33431
33432   // This is similar to the above case, but here we handle a scalar 64-bit
33433   // integer store that is extracted from a vector on a 32-bit target.
33434   // If we have SSE2, then we can treat it like a floating-point double
33435   // to get past legalization. The execution dependencies fixup pass will
33436   // choose the optimal machine instruction for the store if this really is
33437   // an integer or v2f32 rather than an f64.
33438   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
33439       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
33440     SDValue OldExtract = St->getOperand(1);
33441     SDValue ExtOp0 = OldExtract.getOperand(0);
33442     unsigned VecSize = ExtOp0.getValueSizeInBits();
33443     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33444     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33445     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33446                                      BitCast, OldExtract.getOperand(1));
33447     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33448                         St->getPointerInfo(), St->getAlignment(),
33449                         St->getMemOperand()->getFlags());
33450   }
33451
33452   return SDValue();
33453 }
33454
33455 /// Return 'true' if this vector operation is "horizontal"
33456 /// and return the operands for the horizontal operation in LHS and RHS.  A
33457 /// horizontal operation performs the binary operation on successive elements
33458 /// of its first operand, then on successive elements of its second operand,
33459 /// returning the resulting values in a vector.  For example, if
33460 ///   A = < float a0, float a1, float a2, float a3 >
33461 /// and
33462 ///   B = < float b0, float b1, float b2, float b3 >
33463 /// then the result of doing a horizontal operation on A and B is
33464 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33465 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33466 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33467 /// set to A, RHS to B, and the routine returns 'true'.
33468 /// Note that the binary operation should have the property that if one of the
33469 /// operands is UNDEF then the result is UNDEF.
33470 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33471   // Look for the following pattern: if
33472   //   A = < float a0, float a1, float a2, float a3 >
33473   //   B = < float b0, float b1, float b2, float b3 >
33474   // and
33475   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33476   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33477   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33478   // which is A horizontal-op B.
33479
33480   // At least one of the operands should be a vector shuffle.
33481   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33482       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33483     return false;
33484
33485   MVT VT = LHS.getSimpleValueType();
33486
33487   assert((VT.is128BitVector() || VT.is256BitVector()) &&
33488          "Unsupported vector type for horizontal add/sub");
33489
33490   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33491   // operate independently on 128-bit lanes.
33492   unsigned NumElts = VT.getVectorNumElements();
33493   unsigned NumLanes = VT.getSizeInBits()/128;
33494   unsigned NumLaneElts = NumElts / NumLanes;
33495   assert((NumLaneElts % 2 == 0) &&
33496          "Vector type should have an even number of elements in each lane");
33497   unsigned HalfLaneElts = NumLaneElts/2;
33498
33499   // View LHS in the form
33500   //   LHS = VECTOR_SHUFFLE A, B, LMask
33501   // If LHS is not a shuffle then pretend it is the shuffle
33502   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33503   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
33504   // type VT.
33505   SDValue A, B;
33506   SmallVector<int, 16> LMask(NumElts);
33507   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33508     if (!LHS.getOperand(0).isUndef())
33509       A = LHS.getOperand(0);
33510     if (!LHS.getOperand(1).isUndef())
33511       B = LHS.getOperand(1);
33512     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33513     std::copy(Mask.begin(), Mask.end(), LMask.begin());
33514   } else {
33515     if (!LHS.isUndef())
33516       A = LHS;
33517     for (unsigned i = 0; i != NumElts; ++i)
33518       LMask[i] = i;
33519   }
33520
33521   // Likewise, view RHS in the form
33522   //   RHS = VECTOR_SHUFFLE C, D, RMask
33523   SDValue C, D;
33524   SmallVector<int, 16> RMask(NumElts);
33525   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33526     if (!RHS.getOperand(0).isUndef())
33527       C = RHS.getOperand(0);
33528     if (!RHS.getOperand(1).isUndef())
33529       D = RHS.getOperand(1);
33530     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33531     std::copy(Mask.begin(), Mask.end(), RMask.begin());
33532   } else {
33533     if (!RHS.isUndef())
33534       C = RHS;
33535     for (unsigned i = 0; i != NumElts; ++i)
33536       RMask[i] = i;
33537   }
33538
33539   // Check that the shuffles are both shuffling the same vectors.
33540   if (!(A == C && B == D) && !(A == D && B == C))
33541     return false;
33542
33543   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33544   if (!A.getNode() && !B.getNode())
33545     return false;
33546
33547   // If A and B occur in reverse order in RHS, then "swap" them (which means
33548   // rewriting the mask).
33549   if (A != C)
33550     ShuffleVectorSDNode::commuteMask(RMask);
33551
33552   // At this point LHS and RHS are equivalent to
33553   //   LHS = VECTOR_SHUFFLE A, B, LMask
33554   //   RHS = VECTOR_SHUFFLE A, B, RMask
33555   // Check that the masks correspond to performing a horizontal operation.
33556   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33557     for (unsigned i = 0; i != NumLaneElts; ++i) {
33558       int LIdx = LMask[i+l], RIdx = RMask[i+l];
33559
33560       // Ignore any UNDEF components.
33561       if (LIdx < 0 || RIdx < 0 ||
33562           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33563           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33564         continue;
33565
33566       // Check that successive elements are being operated on.  If not, this is
33567       // not a horizontal operation.
33568       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33569       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
33570       if (!(LIdx == Index && RIdx == Index + 1) &&
33571           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33572         return false;
33573     }
33574   }
33575
33576   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33577   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33578   return true;
33579 }
33580
33581 /// Do target-specific dag combines on floating-point adds/subs.
33582 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33583                                const X86Subtarget &Subtarget) {
33584   EVT VT = N->getValueType(0);
33585   SDValue LHS = N->getOperand(0);
33586   SDValue RHS = N->getOperand(1);
33587   bool IsFadd = N->getOpcode() == ISD::FADD;
33588   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33589
33590   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33591   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33592        (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33593       isHorizontalBinOp(LHS, RHS, IsFadd)) {
33594     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33595     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33596   }
33597   return SDValue();
33598 }
33599
33600 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33601 /// the codegen.
33602 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33603 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33604                                           const X86Subtarget &Subtarget,
33605                                           SDLoc &DL) {
33606   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33607   SDValue Src = N->getOperand(0);
33608   unsigned Opcode = Src.getOpcode();
33609   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33610
33611   EVT VT = N->getValueType(0);
33612   EVT SrcVT = Src.getValueType();
33613
33614   auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33615     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33616
33617     // Repeated operand, so we are only trading one output truncation for
33618     // one input truncation.
33619     if (Op0 == Op1)
33620       return true;
33621
33622     // See if either operand has been extended from a smaller/equal size to
33623     // the truncation size, allowing a truncation to combine with the extend.
33624     unsigned Opcode0 = Op0.getOpcode();
33625     if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33626          Opcode0 == ISD::ZERO_EXTEND) &&
33627         Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33628       return true;
33629
33630     unsigned Opcode1 = Op1.getOpcode();
33631     if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33632          Opcode1 == ISD::ZERO_EXTEND) &&
33633         Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33634       return true;
33635
33636     // See if either operand is a single use constant which can be constant
33637     // folded.
33638     SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33639     SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33640     return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33641            ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33642   };
33643
33644   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33645     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33646     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33647     return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33648   };
33649
33650   // Don't combine if the operation has other uses.
33651   if (!N->isOnlyUserOf(Src.getNode()))
33652     return SDValue();
33653
33654   // Only support vector truncation for now.
33655   // TODO: i64 scalar math would benefit as well.
33656   if (!VT.isVector())
33657     return SDValue();
33658
33659   // In most cases its only worth pre-truncating if we're only facing the cost
33660   // of one truncation.
33661   // i.e. if one of the inputs will constant fold or the input is repeated.
33662   switch (Opcode) {
33663   case ISD::AND:
33664   case ISD::XOR:
33665   case ISD::OR: {
33666     SDValue Op0 = Src.getOperand(0);
33667     SDValue Op1 = Src.getOperand(1);
33668     if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33669         IsRepeatedOpOrFreeTruncation(Op0, Op1))
33670       return TruncateArithmetic(Op0, Op1);
33671     break;
33672   }
33673
33674   case ISD::MUL:
33675     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
33676     // better to truncate if we have the chance.
33677     if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33678         !TLI.isOperationLegal(Opcode, SrcVT))
33679       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33680     LLVM_FALLTHROUGH;
33681   case ISD::ADD: {
33682     SDValue Op0 = Src.getOperand(0);
33683     SDValue Op1 = Src.getOperand(1);
33684     if (TLI.isOperationLegal(Opcode, VT) &&
33685         IsRepeatedOpOrFreeTruncation(Op0, Op1))
33686       return TruncateArithmetic(Op0, Op1);
33687     break;
33688   }
33689   }
33690
33691   return SDValue();
33692 }
33693
33694 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33695 static SDValue
33696 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33697                                   SmallVector<SDValue, 8> &Regs) {
33698   assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33699                              Regs[0].getValueType() == MVT::v2i64));
33700   EVT OutVT = N->getValueType(0);
33701   EVT OutSVT = OutVT.getVectorElementType();
33702   EVT InVT = Regs[0].getValueType();
33703   EVT InSVT = InVT.getVectorElementType();
33704   SDLoc DL(N);
33705
33706   // First, use mask to unset all bits that won't appear in the result.
33707   assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33708          "OutSVT can only be either i8 or i16.");
33709   APInt Mask =
33710       APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33711   SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33712   for (auto &Reg : Regs)
33713     Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33714
33715   MVT UnpackedVT, PackedVT;
33716   if (OutSVT == MVT::i8) {
33717     UnpackedVT = MVT::v8i16;
33718     PackedVT = MVT::v16i8;
33719   } else {
33720     UnpackedVT = MVT::v4i32;
33721     PackedVT = MVT::v8i16;
33722   }
33723
33724   // In each iteration, truncate the type by a half size.
33725   auto RegNum = Regs.size();
33726   for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33727        j < e; j *= 2, RegNum /= 2) {
33728     for (unsigned i = 0; i < RegNum; i++)
33729       Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33730     for (unsigned i = 0; i < RegNum / 2; i++)
33731       Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33732                             Regs[i * 2 + 1]);
33733   }
33734
33735   // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
33736   // then extract a subvector as the result since v8i8 is not a legal type.
33737   if (OutVT == MVT::v8i8) {
33738     Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33739     Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33740                           DAG.getIntPtrConstant(0, DL));
33741     return Regs[0];
33742   } else if (RegNum > 1) {
33743     Regs.resize(RegNum);
33744     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33745   } else
33746     return Regs[0];
33747 }
33748
33749 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33750 static SDValue
33751 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33752                                   SelectionDAG &DAG,
33753                                   SmallVector<SDValue, 8> &Regs) {
33754   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33755   EVT OutVT = N->getValueType(0);
33756   SDLoc DL(N);
33757
33758   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
33759   SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33760   for (auto &Reg : Regs) {
33761     Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33762                               Subtarget, DAG);
33763     Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33764                               Subtarget, DAG);
33765   }
33766
33767   for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33768     Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33769                           Regs[i * 2 + 1]);
33770
33771   if (Regs.size() > 2) {
33772     Regs.resize(Regs.size() / 2);
33773     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33774   } else
33775     return Regs[0];
33776 }
33777
33778 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33779 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33780 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33781 /// element that is extracted from a vector and then truncated, and it is
33782 /// difficult to do this optimization based on them.
33783 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33784                                        const X86Subtarget &Subtarget) {
33785   EVT OutVT = N->getValueType(0);
33786   if (!OutVT.isVector())
33787     return SDValue();
33788
33789   SDValue In = N->getOperand(0);
33790   if (!In.getValueType().isSimple())
33791     return SDValue();
33792
33793   EVT InVT = In.getValueType();
33794   unsigned NumElems = OutVT.getVectorNumElements();
33795
33796   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33797   // SSE2, and we need to take care of it specially.
33798   // AVX512 provides vpmovdb.
33799   if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33800     return SDValue();
33801
33802   EVT OutSVT = OutVT.getVectorElementType();
33803   EVT InSVT = InVT.getVectorElementType();
33804   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33805         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33806         NumElems >= 8))
33807     return SDValue();
33808
33809   // SSSE3's pshufb results in less instructions in the cases below.
33810   if (Subtarget.hasSSSE3() && NumElems == 8 &&
33811       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33812        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33813     return SDValue();
33814
33815   SDLoc DL(N);
33816
33817   // Split a long vector into vectors of legal type.
33818   unsigned RegNum = InVT.getSizeInBits() / 128;
33819   SmallVector<SDValue, 8> SubVec(RegNum);
33820   unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33821   EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33822
33823   for (unsigned i = 0; i < RegNum; i++)
33824     SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33825                             DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33826
33827   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33828   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33829   // truncate 2 x v4i32 to v8i16.
33830   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33831     return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33832   else if (InSVT == MVT::i32)
33833     return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33834   else
33835     return SDValue();
33836 }
33837
33838 /// This function transforms vector truncation of 'all or none' bits values.
33839 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
33840 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33841                                                SelectionDAG &DAG,
33842                                                const X86Subtarget &Subtarget) {
33843   // Requires SSE2 but AVX512 has fast truncate.
33844   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33845     return SDValue();
33846
33847   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33848     return SDValue();
33849
33850   SDValue In = N->getOperand(0);
33851   if (!In.getValueType().isSimple())
33852     return SDValue();
33853
33854   MVT VT = N->getValueType(0).getSimpleVT();
33855   MVT SVT = VT.getScalarType();
33856
33857   MVT InVT = In.getValueType().getSimpleVT();
33858   MVT InSVT = InVT.getScalarType();
33859
33860   // Use PACKSS if the input is a splatted sign bit.
33861   // e.g. Comparison result, sext_in_reg, etc.
33862   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33863   if (NumSignBits != InSVT.getSizeInBits())
33864     return SDValue();
33865
33866   // Check we have a truncation suited for PACKSS.
33867   if (!VT.is128BitVector() && !VT.is256BitVector())
33868     return SDValue();
33869   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33870     return SDValue();
33871   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33872     return SDValue();
33873
33874   return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33875 }
33876
33877 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33878                                const X86Subtarget &Subtarget) {
33879   EVT VT = N->getValueType(0);
33880   SDValue Src = N->getOperand(0);
33881   SDLoc DL(N);
33882
33883   // Attempt to pre-truncate inputs to arithmetic ops instead.
33884   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33885     return V;
33886
33887   // Try to detect AVG pattern first.
33888   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33889     return Avg;
33890
33891   // Try to combine truncation with unsigned saturation.
33892   if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33893     return Val;
33894
33895   // The bitcast source is a direct mmx result.
33896   // Detect bitcasts between i32 to x86mmx
33897   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33898     SDValue BCSrc = Src.getOperand(0);
33899     if (BCSrc.getValueType() == MVT::x86mmx)
33900       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33901   }
33902
33903   // Try to truncate extended sign bits with PACKSS.
33904   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33905     return V;
33906
33907   return combineVectorTruncation(N, DAG, Subtarget);
33908 }
33909
33910 /// Returns the negated value if the node \p N flips sign of FP value.
33911 ///
33912 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33913 /// AVX512F does not have FXOR, so FNEG is lowered as
33914 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33915 /// In this case we go though all bitcasts.
33916 static SDValue isFNEG(SDNode *N) {
33917   if (N->getOpcode() == ISD::FNEG)
33918     return N->getOperand(0);
33919
33920   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33921   if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33922     return SDValue();
33923
33924   SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33925   if (!Op1.getValueType().isFloatingPoint())
33926     return SDValue();
33927
33928   SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33929
33930   unsigned EltBits = Op1.getScalarValueSizeInBits();
33931   auto isSignMask = [&](const ConstantFP *C) {
33932     return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33933   };
33934
33935   // There is more than one way to represent the same constant on
33936   // the different X86 targets. The type of the node may also depend on size.
33937   //  - load scalar value and broadcast
33938   //  - BUILD_VECTOR node
33939   //  - load from a constant pool.
33940   // We check all variants here.
33941   if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33942     if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33943       if (isSignMask(cast<ConstantFP>(C)))
33944         return Op0;
33945
33946   } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33947     if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33948       if (isSignMask(CN->getConstantFPValue()))
33949         return Op0;
33950
33951   } else if (auto *C = getTargetConstantFromNode(Op1)) {
33952     if (C->getType()->isVectorTy()) {
33953       if (auto *SplatV = C->getSplatValue())
33954         if (isSignMask(cast<ConstantFP>(SplatV)))
33955           return Op0;
33956     } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33957       if (isSignMask(FPConst))
33958         return Op0;
33959   }
33960   return SDValue();
33961 }
33962
33963 /// Do target-specific dag combines on floating point negations.
33964 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33965                            const X86Subtarget &Subtarget) {
33966   EVT OrigVT = N->getValueType(0);
33967   SDValue Arg = isFNEG(N);
33968   assert(Arg.getNode() && "N is expected to be an FNEG node");
33969
33970   EVT VT = Arg.getValueType();
33971   EVT SVT = VT.getScalarType();
33972   SDLoc DL(N);
33973
33974   // Let legalize expand this if it isn't a legal type yet.
33975   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33976     return SDValue();
33977
33978   // If we're negating a FMUL node on a target with FMA, then we can avoid the
33979   // use of a constant by performing (-0 - A*B) instead.
33980   // FIXME: Check rounding control flags as well once it becomes available.
33981   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33982       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33983     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33984     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33985                                   Arg.getOperand(1), Zero);
33986     return DAG.getBitcast(OrigVT, NewNode);
33987   }
33988
33989   // If we're negating an FMA node, then we can adjust the
33990   // instruction to include the extra negation.
33991   unsigned NewOpcode = 0;
33992   if (Arg.hasOneUse()) {
33993     switch (Arg.getOpcode()) {
33994     case X86ISD::FMADD:        NewOpcode = X86ISD::FNMSUB;       break;
33995     case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;       break;
33996     case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;        break;
33997     case X86ISD::FNMSUB:       NewOpcode = X86ISD::FMADD;        break;
33998     case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND;   break;
33999     case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND;   break;
34000     case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;    break;
34001     case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;    break;
34002     // We can't handle scalar intrinsic node here because it would only
34003     // invert one element and not the whole vector. But we could try to handle
34004     // a negation of the lower element only.
34005     }
34006   }
34007   if (NewOpcode)
34008     return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
34009                                               Arg.getNode()->ops()));
34010
34011   return SDValue();
34012 }
34013
34014 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
34015                                  const X86Subtarget &Subtarget) {
34016   MVT VT = N->getSimpleValueType(0);
34017   // If we have integer vector types available, use the integer opcodes.
34018   if (VT.isVector() && Subtarget.hasSSE2()) {
34019     SDLoc dl(N);
34020
34021     MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
34022
34023     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
34024     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
34025     unsigned IntOpcode;
34026     switch (N->getOpcode()) {
34027     default: llvm_unreachable("Unexpected FP logic op");
34028     case X86ISD::FOR: IntOpcode = ISD::OR; break;
34029     case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
34030     case X86ISD::FAND: IntOpcode = ISD::AND; break;
34031     case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
34032     }
34033     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
34034     return DAG.getBitcast(VT, IntOp);
34035   }
34036   return SDValue();
34037 }
34038
34039 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
34040                           TargetLowering::DAGCombinerInfo &DCI,
34041                           const X86Subtarget &Subtarget) {
34042   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
34043     return Cmp;
34044
34045   if (DCI.isBeforeLegalizeOps())
34046     return SDValue();
34047
34048   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
34049     return RV;
34050
34051   if (Subtarget.hasCMov())
34052     if (SDValue RV = combineIntegerAbs(N, DAG))
34053       return RV;
34054
34055   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34056     return FPLogic;
34057
34058   if (isFNEG(N))
34059     return combineFneg(N, DAG, Subtarget);
34060   return SDValue();
34061 }
34062
34063
34064 static bool isNullFPScalarOrVectorConst(SDValue V) {
34065   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
34066 }
34067
34068 /// If a value is a scalar FP zero or a vector FP zero (potentially including
34069 /// undefined elements), return a zero constant that may be used to fold away
34070 /// that value. In the case of a vector, the returned constant will not contain
34071 /// undefined elements even if the input parameter does. This makes it suitable
34072 /// to be used as a replacement operand with operations (eg, bitwise-and) where
34073 /// an undef should not propagate.
34074 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
34075                                         const X86Subtarget &Subtarget) {
34076   if (!isNullFPScalarOrVectorConst(V))
34077     return SDValue();
34078
34079   if (V.getValueType().isVector())
34080     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
34081
34082   return V;
34083 }
34084
34085 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
34086                                       const X86Subtarget &Subtarget) {
34087   SDValue N0 = N->getOperand(0);
34088   SDValue N1 = N->getOperand(1);
34089   EVT VT = N->getValueType(0);
34090   SDLoc DL(N);
34091
34092   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
34093   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
34094         (VT == MVT::f64 && Subtarget.hasSSE2())))
34095     return SDValue();
34096
34097   auto isAllOnesConstantFP = [](SDValue V) {
34098     auto *C = dyn_cast<ConstantFPSDNode>(V);
34099     return C && C->getConstantFPValue()->isAllOnesValue();
34100   };
34101
34102   // fand (fxor X, -1), Y --> fandn X, Y
34103   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
34104     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
34105
34106   // fand X, (fxor Y, -1) --> fandn Y, X
34107   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
34108     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
34109
34110   return SDValue();
34111 }
34112
34113 /// Do target-specific dag combines on X86ISD::FAND nodes.
34114 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
34115                            const X86Subtarget &Subtarget) {
34116   // FAND(0.0, x) -> 0.0
34117   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
34118     return V;
34119
34120   // FAND(x, 0.0) -> 0.0
34121   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34122     return V;
34123
34124   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
34125     return V;
34126
34127   return lowerX86FPLogicOp(N, DAG, Subtarget);
34128 }
34129
34130 /// Do target-specific dag combines on X86ISD::FANDN nodes.
34131 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
34132                             const X86Subtarget &Subtarget) {
34133   // FANDN(0.0, x) -> x
34134   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34135     return N->getOperand(1);
34136
34137   // FANDN(x, 0.0) -> 0.0
34138   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34139     return V;
34140
34141   return lowerX86FPLogicOp(N, DAG, Subtarget);
34142 }
34143
34144 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
34145 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
34146                           const X86Subtarget &Subtarget) {
34147   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
34148
34149   // F[X]OR(0.0, x) -> x
34150   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34151     return N->getOperand(1);
34152
34153   // F[X]OR(x, 0.0) -> x
34154   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
34155     return N->getOperand(0);
34156
34157   if (isFNEG(N))
34158     if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
34159       return NewVal;
34160
34161   return lowerX86FPLogicOp(N, DAG, Subtarget);
34162 }
34163
34164 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
34165 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
34166   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
34167
34168   // Only perform optimizations if UnsafeMath is used.
34169   if (!DAG.getTarget().Options.UnsafeFPMath)
34170     return SDValue();
34171
34172   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
34173   // into FMINC and FMAXC, which are Commutative operations.
34174   unsigned NewOp = 0;
34175   switch (N->getOpcode()) {
34176     default: llvm_unreachable("unknown opcode");
34177     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
34178     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
34179   }
34180
34181   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
34182                      N->getOperand(0), N->getOperand(1));
34183 }
34184
34185 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
34186                                      const X86Subtarget &Subtarget) {
34187   if (Subtarget.useSoftFloat())
34188     return SDValue();
34189
34190   // TODO: Check for global or instruction-level "nnan". In that case, we
34191   //       should be able to lower to FMAX/FMIN alone.
34192   // TODO: If an operand is already known to be a NaN or not a NaN, this
34193   //       should be an optional swap and FMAX/FMIN.
34194
34195   EVT VT = N->getValueType(0);
34196   if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
34197         (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
34198         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
34199     return SDValue();
34200
34201   // This takes at least 3 instructions, so favor a library call when operating
34202   // on a scalar and minimizing code size.
34203   if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
34204     return SDValue();
34205
34206   SDValue Op0 = N->getOperand(0);
34207   SDValue Op1 = N->getOperand(1);
34208   SDLoc DL(N);
34209   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
34210       DAG.getDataLayout(), *DAG.getContext(), VT);
34211
34212   // There are 4 possibilities involving NaN inputs, and these are the required
34213   // outputs:
34214   //                   Op1
34215   //               Num     NaN
34216   //            ----------------
34217   //       Num  |  Max  |  Op0 |
34218   // Op0        ----------------
34219   //       NaN  |  Op1  |  NaN |
34220   //            ----------------
34221   //
34222   // The SSE FP max/min instructions were not designed for this case, but rather
34223   // to implement:
34224   //   Min = Op1 < Op0 ? Op1 : Op0
34225   //   Max = Op1 > Op0 ? Op1 : Op0
34226   //
34227   // So they always return Op0 if either input is a NaN. However, we can still
34228   // use those instructions for fmaxnum by selecting away a NaN input.
34229
34230   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
34231   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
34232   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
34233   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
34234
34235   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
34236   // are NaN, the NaN value of Op1 is the result.
34237   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
34238 }
34239
34240 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
34241 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
34242                             TargetLowering::DAGCombinerInfo &DCI,
34243                             const X86Subtarget &Subtarget) {
34244   // ANDNP(0, x) -> x
34245   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
34246     return N->getOperand(1);
34247
34248   // ANDNP(x, 0) -> 0
34249   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
34250     return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
34251
34252   EVT VT = N->getValueType(0);
34253
34254   // Attempt to recursively combine a bitmask ANDNP with shuffles.
34255   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34256     SDValue Op(N, 0);
34257     SmallVector<int, 1> NonceMask; // Just a placeholder.
34258     NonceMask.push_back(0);
34259     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
34260                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
34261                                       DCI, Subtarget))
34262       return SDValue(); // This routine will use CombineTo to replace N.
34263   }
34264
34265   return SDValue();
34266 }
34267
34268 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
34269                          TargetLowering::DAGCombinerInfo &DCI) {
34270   // BT ignores high bits in the bit index operand.
34271   SDValue Op1 = N->getOperand(1);
34272   if (Op1.hasOneUse()) {
34273     unsigned BitWidth = Op1.getValueSizeInBits();
34274     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
34275     KnownBits Known;
34276     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
34277                                           !DCI.isBeforeLegalizeOps());
34278     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34279     if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
34280         TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
34281       DCI.CommitTargetLoweringOpt(TLO);
34282   }
34283   return SDValue();
34284 }
34285
34286 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
34287                                       const X86Subtarget &Subtarget) {
34288   EVT VT = N->getValueType(0);
34289   if (!VT.isVector())
34290     return SDValue();
34291
34292   SDValue N0 = N->getOperand(0);
34293   SDValue N1 = N->getOperand(1);
34294   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
34295   SDLoc dl(N);
34296
34297   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
34298   // both SSE and AVX2 since there is no sign-extended shift right
34299   // operation on a vector with 64-bit elements.
34300   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
34301   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
34302   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
34303       N0.getOpcode() == ISD::SIGN_EXTEND)) {
34304     SDValue N00 = N0.getOperand(0);
34305
34306     // EXTLOAD has a better solution on AVX2,
34307     // it may be replaced with X86ISD::VSEXT node.
34308     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
34309       if (!ISD::isNormalLoad(N00.getNode()))
34310         return SDValue();
34311
34312     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
34313         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
34314                                   N00, N1);
34315       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
34316     }
34317   }
34318   return SDValue();
34319 }
34320
34321 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
34322 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
34323 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
34324 /// opportunities to combine math ops, use an LEA, or use a complex addressing
34325 /// mode. This can eliminate extend, add, and shift instructions.
34326 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
34327                                    const X86Subtarget &Subtarget) {
34328   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
34329       Ext->getOpcode() != ISD::ZERO_EXTEND)
34330     return SDValue();
34331
34332   // TODO: This should be valid for other integer types.
34333   EVT VT = Ext->getValueType(0);
34334   if (VT != MVT::i64)
34335     return SDValue();
34336
34337   SDValue Add = Ext->getOperand(0);
34338   if (Add.getOpcode() != ISD::ADD)
34339     return SDValue();
34340
34341   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
34342   bool NSW = Add->getFlags().hasNoSignedWrap();
34343   bool NUW = Add->getFlags().hasNoUnsignedWrap();
34344
34345   // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
34346   // into the 'zext'
34347   if ((Sext && !NSW) || (!Sext && !NUW))
34348     return SDValue();
34349
34350   // Having a constant operand to the 'add' ensures that we are not increasing
34351   // the instruction count because the constant is extended for free below.
34352   // A constant operand can also become the displacement field of an LEA.
34353   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
34354   if (!AddOp1)
34355     return SDValue();
34356
34357   // Don't make the 'add' bigger if there's no hope of combining it with some
34358   // other 'add' or 'shl' instruction.
34359   // TODO: It may be profitable to generate simpler LEA instructions in place
34360   // of single 'add' instructions, but the cost model for selecting an LEA
34361   // currently has a high threshold.
34362   bool HasLEAPotential = false;
34363   for (auto *User : Ext->uses()) {
34364     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
34365       HasLEAPotential = true;
34366       break;
34367     }
34368   }
34369   if (!HasLEAPotential)
34370     return SDValue();
34371
34372   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
34373   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
34374   SDValue AddOp0 = Add.getOperand(0);
34375   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
34376   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
34377
34378   // The wider add is guaranteed to not wrap because both operands are
34379   // sign-extended.
34380   SDNodeFlags Flags;
34381   Flags.setNoSignedWrap(NSW);
34382   Flags.setNoUnsignedWrap(NUW);
34383   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
34384 }
34385
34386 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
34387 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
34388 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
34389 /// extends from AH (which we otherwise need to do contortions to access).
34390 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
34391   SDValue N0 = N->getOperand(0);
34392   auto OpcodeN = N->getOpcode();
34393   auto OpcodeN0 = N0.getOpcode();
34394   if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
34395         (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
34396     return SDValue();
34397
34398   EVT VT = N->getValueType(0);
34399   EVT InVT = N0.getValueType();
34400   if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
34401     return SDValue();
34402
34403   SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
34404   auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
34405                                                : X86ISD::UDIVREM8_ZEXT_HREG;
34406   SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
34407                           N0.getOperand(1));
34408   DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
34409   return R.getValue(1);
34410 }
34411
34412 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34413 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
34414 /// with UNDEFs) of the input to vectors of the same size as the target type
34415 /// which then extends the lowest elements.
34416 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
34417                                           TargetLowering::DAGCombinerInfo &DCI,
34418                                           const X86Subtarget &Subtarget) {
34419   unsigned Opcode = N->getOpcode();
34420   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
34421     return SDValue();
34422   if (!DCI.isBeforeLegalizeOps())
34423     return SDValue();
34424   if (!Subtarget.hasSSE2())
34425     return SDValue();
34426
34427   SDValue N0 = N->getOperand(0);
34428   EVT VT = N->getValueType(0);
34429   EVT SVT = VT.getScalarType();
34430   EVT InVT = N0.getValueType();
34431   EVT InSVT = InVT.getScalarType();
34432
34433   // Input type must be a vector and we must be extending legal integer types.
34434   if (!VT.isVector())
34435     return SDValue();
34436   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
34437     return SDValue();
34438   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34439     return SDValue();
34440
34441   // On AVX2+ targets, if the input/output types are both legal then we will be
34442   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34443   if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34444       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34445     return SDValue();
34446
34447   SDLoc DL(N);
34448
34449   auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34450     EVT InVT = N.getValueType();
34451     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34452                                  Size / InVT.getScalarSizeInBits());
34453     SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34454                                   DAG.getUNDEF(InVT));
34455     Opnds[0] = N;
34456     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34457   };
34458
34459   // If target-size is less than 128-bits, extend to a type that would extend
34460   // to 128 bits, extend that and extract the original target vector.
34461   if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34462     unsigned Scale = 128 / VT.getSizeInBits();
34463     EVT ExVT =
34464         EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34465     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34466     SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34467     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34468                        DAG.getIntPtrConstant(0, DL));
34469   }
34470
34471   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
34472   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
34473   // Also use this if we don't have SSE41 to allow the legalizer do its job.
34474   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34475       (VT.is256BitVector() && Subtarget.hasInt256()) ||
34476       (VT.is512BitVector() && Subtarget.hasAVX512())) {
34477     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34478     return Opcode == ISD::SIGN_EXTEND
34479                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34480                : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34481   }
34482
34483   auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34484     unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34485     unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34486     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34487     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34488
34489     SmallVector<SDValue, 8> Opnds;
34490     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34491       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34492                                    DAG.getIntPtrConstant(Offset, DL));
34493       SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34494       SrcVec = Opcode == ISD::SIGN_EXTEND
34495                    ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34496                    : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34497       Opnds.push_back(SrcVec);
34498     }
34499     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34500   };
34501
34502   // On pre-AVX2 targets, split into 128-bit nodes of
34503   // ISD::*_EXTEND_VECTOR_INREG.
34504   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34505     return SplitAndExtendInReg(128);
34506
34507   // On pre-AVX512 targets, split into 256-bit nodes of
34508   // ISD::*_EXTEND_VECTOR_INREG.
34509   if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34510     return SplitAndExtendInReg(256);
34511
34512   return SDValue();
34513 }
34514
34515 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34516                            TargetLowering::DAGCombinerInfo &DCI,
34517                            const X86Subtarget &Subtarget) {
34518   SDValue N0 = N->getOperand(0);
34519   EVT VT = N->getValueType(0);
34520   EVT InVT = N0.getValueType();
34521   SDLoc DL(N);
34522
34523   if (SDValue DivRem8 = getDivRem8(N, DAG))
34524     return DivRem8;
34525
34526   if (!DCI.isBeforeLegalizeOps()) {
34527     if (InVT == MVT::i1) {
34528       SDValue Zero = DAG.getConstant(0, DL, VT);
34529       SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34530       return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34531     }
34532     return SDValue();
34533   }
34534
34535   if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34536       isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34537     // Invert and sign-extend a boolean is the same as zero-extend and subtract
34538     // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
34539     // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34540     // sext (xor Bool, -1) --> sub (zext Bool), 1
34541     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34542     return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34543   }
34544
34545   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34546     return V;
34547
34548   if (Subtarget.hasAVX() && VT.is256BitVector())
34549     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34550       return R;
34551
34552   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34553     return NewAdd;
34554
34555   return SDValue();
34556 }
34557
34558 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34559                           const X86Subtarget &Subtarget) {
34560   SDLoc dl(N);
34561   EVT VT = N->getValueType(0);
34562
34563   // Let legalize expand this if it isn't a legal type yet.
34564   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34565     return SDValue();
34566
34567   EVT ScalarVT = VT.getScalarType();
34568   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34569     return SDValue();
34570
34571   SDValue A = N->getOperand(0);
34572   SDValue B = N->getOperand(1);
34573   SDValue C = N->getOperand(2);
34574
34575   auto invertIfNegative = [](SDValue &V) {
34576     if (SDValue NegVal = isFNEG(V.getNode())) {
34577       V = NegVal;
34578       return true;
34579     }
34580     return false;
34581   };
34582
34583   // Do not convert the passthru input of scalar intrinsics.
34584   // FIXME: We could allow negations of the lower element only.
34585   bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34586   bool NegB = invertIfNegative(B);
34587   bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34588
34589   // Negative multiplication when NegA xor NegB
34590   bool NegMul = (NegA != NegB);
34591
34592   unsigned NewOpcode;
34593   if (!NegMul)
34594     NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34595   else
34596     NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34597
34598
34599   if (N->getOpcode() == X86ISD::FMADD_RND) {
34600     switch (NewOpcode) {
34601     case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
34602     case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
34603     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34604     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34605     }
34606   } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34607     switch (NewOpcode) {
34608     case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
34609     case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
34610     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34611     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34612     }
34613   } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34614     switch (NewOpcode) {
34615     case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
34616     case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
34617     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34618     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34619     }
34620   } else {
34621     assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34622            "Unexpected opcode!");
34623     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34624   }
34625
34626   return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34627 }
34628
34629 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34630                            TargetLowering::DAGCombinerInfo &DCI,
34631                            const X86Subtarget &Subtarget) {
34632   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
34633   //           (and (i32 x86isd::setcc_carry), 1)
34634   // This eliminates the zext. This transformation is necessary because
34635   // ISD::SETCC is always legalized to i8.
34636   SDLoc dl(N);
34637   SDValue N0 = N->getOperand(0);
34638   EVT VT = N->getValueType(0);
34639
34640   if (N0.getOpcode() == ISD::AND &&
34641       N0.hasOneUse() &&
34642       N0.getOperand(0).hasOneUse()) {
34643     SDValue N00 = N0.getOperand(0);
34644     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34645       if (!isOneConstant(N0.getOperand(1)))
34646         return SDValue();
34647       return DAG.getNode(ISD::AND, dl, VT,
34648                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34649                                      N00.getOperand(0), N00.getOperand(1)),
34650                          DAG.getConstant(1, dl, VT));
34651     }
34652   }
34653
34654   if (N0.getOpcode() == ISD::TRUNCATE &&
34655       N0.hasOneUse() &&
34656       N0.getOperand(0).hasOneUse()) {
34657     SDValue N00 = N0.getOperand(0);
34658     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34659       return DAG.getNode(ISD::AND, dl, VT,
34660                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34661                                      N00.getOperand(0), N00.getOperand(1)),
34662                          DAG.getConstant(1, dl, VT));
34663     }
34664   }
34665
34666   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34667     return V;
34668
34669   if (VT.is256BitVector())
34670     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34671       return R;
34672
34673   if (SDValue DivRem8 = getDivRem8(N, DAG))
34674     return DivRem8;
34675
34676   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34677     return NewAdd;
34678
34679   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34680     return R;
34681
34682   return SDValue();
34683 }
34684
34685 /// Try to map a 128-bit or larger integer comparison to vector instructions
34686 /// before type legalization splits it up into chunks.
34687 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34688                                                const X86Subtarget &Subtarget) {
34689   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34690   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34691
34692   // We're looking for an oversized integer equality comparison, but ignore a
34693   // comparison with zero because that gets special treatment in EmitTest().
34694   SDValue X = SetCC->getOperand(0);
34695   SDValue Y = SetCC->getOperand(1);
34696   EVT OpVT = X.getValueType();
34697   unsigned OpSize = OpVT.getSizeInBits();
34698   if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34699     return SDValue();
34700
34701   // TODO: Use PXOR + PTEST for SSE4.1 or later?
34702   // TODO: Add support for AVX-512.
34703   EVT VT = SetCC->getValueType(0);
34704   SDLoc DL(SetCC);
34705   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34706       (OpSize == 256 && Subtarget.hasAVX2())) {
34707     EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34708     SDValue VecX = DAG.getBitcast(VecVT, X);
34709     SDValue VecY = DAG.getBitcast(VecVT, Y);
34710
34711     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34712     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34713     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34714     // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34715     // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34716     SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34717     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34718     SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34719                                     MVT::i32);
34720     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34721   }
34722
34723   return SDValue();
34724 }
34725
34726 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34727                             const X86Subtarget &Subtarget) {
34728   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34729   SDValue LHS = N->getOperand(0);
34730   SDValue RHS = N->getOperand(1);
34731   EVT VT = N->getValueType(0);
34732   SDLoc DL(N);
34733
34734   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34735     EVT OpVT = LHS.getValueType();
34736     // 0-x == y --> x+y == 0
34737     // 0-x != y --> x+y != 0
34738     if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34739         LHS.hasOneUse()) {
34740       SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34741       return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34742     }
34743     // x == 0-y --> x+y == 0
34744     // x != 0-y --> x+y != 0
34745     if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34746         RHS.hasOneUse()) {
34747       SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34748       return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34749     }
34750
34751     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34752       return V;
34753   }
34754
34755   if (VT.getScalarType() == MVT::i1 &&
34756       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34757     bool IsSEXT0 =
34758         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34759         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34760     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34761
34762     if (!IsSEXT0 || !IsVZero1) {
34763       // Swap the operands and update the condition code.
34764       std::swap(LHS, RHS);
34765       CC = ISD::getSetCCSwappedOperands(CC);
34766
34767       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34768                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34769       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34770     }
34771
34772     if (IsSEXT0 && IsVZero1) {
34773       assert(VT == LHS.getOperand(0).getValueType() &&
34774              "Uexpected operand type");
34775       if (CC == ISD::SETGT)
34776         return DAG.getConstant(0, DL, VT);
34777       if (CC == ISD::SETLE)
34778         return DAG.getConstant(1, DL, VT);
34779       if (CC == ISD::SETEQ || CC == ISD::SETGE)
34780         return DAG.getNOT(DL, LHS.getOperand(0), VT);
34781
34782       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34783              "Unexpected condition code!");
34784       return LHS.getOperand(0);
34785     }
34786   }
34787
34788   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34789   // to avoid scalarization via legalization because v4i32 is not a legal type.
34790   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34791       LHS.getValueType() == MVT::v4f32)
34792     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34793
34794   return SDValue();
34795 }
34796
34797 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34798   SDLoc DL(N);
34799   // Gather and Scatter instructions use k-registers for masks. The type of
34800   // the masks is v*i1. So the mask will be truncated anyway.
34801   // The SIGN_EXTEND_INREG my be dropped.
34802   SDValue Mask = N->getOperand(2);
34803   if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34804     SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34805     NewOps[2] = Mask.getOperand(0);
34806     DAG.UpdateNodeOperands(N, NewOps);
34807   }
34808   return SDValue();
34809 }
34810
34811 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34812 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34813                                const X86Subtarget &Subtarget) {
34814   SDLoc DL(N);
34815   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34816   SDValue EFLAGS = N->getOperand(1);
34817
34818   // Try to simplify the EFLAGS and condition code operands.
34819   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34820     return getSETCC(CC, Flags, DL, DAG);
34821
34822   return SDValue();
34823 }
34824
34825 /// Optimize branch condition evaluation.
34826 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34827                              const X86Subtarget &Subtarget) {
34828   SDLoc DL(N);
34829   SDValue EFLAGS = N->getOperand(3);
34830   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34831
34832   // Try to simplify the EFLAGS and condition code operands.
34833   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34834   // RAUW them under us.
34835   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34836     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34837     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34838                        N->getOperand(1), Cond, Flags);
34839   }
34840
34841   return SDValue();
34842 }
34843
34844 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34845                                                   SelectionDAG &DAG) {
34846   // Take advantage of vector comparisons producing 0 or -1 in each lane to
34847   // optimize away operation when it's from a constant.
34848   //
34849   // The general transformation is:
34850   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34851   //       AND(VECTOR_CMP(x,y), constant2)
34852   //    constant2 = UNARYOP(constant)
34853
34854   // Early exit if this isn't a vector operation, the operand of the
34855   // unary operation isn't a bitwise AND, or if the sizes of the operations
34856   // aren't the same.
34857   EVT VT = N->getValueType(0);
34858   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34859       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34860       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34861     return SDValue();
34862
34863   // Now check that the other operand of the AND is a constant. We could
34864   // make the transformation for non-constant splats as well, but it's unclear
34865   // that would be a benefit as it would not eliminate any operations, just
34866   // perform one more step in scalar code before moving to the vector unit.
34867   if (BuildVectorSDNode *BV =
34868           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34869     // Bail out if the vector isn't a constant.
34870     if (!BV->isConstant())
34871       return SDValue();
34872
34873     // Everything checks out. Build up the new and improved node.
34874     SDLoc DL(N);
34875     EVT IntVT = BV->getValueType(0);
34876     // Create a new constant of the appropriate type for the transformed
34877     // DAG.
34878     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34879     // The AND node needs bitcasts to/from an integer vector type around it.
34880     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34881     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34882                                  N->getOperand(0)->getOperand(0), MaskConst);
34883     SDValue Res = DAG.getBitcast(VT, NewAnd);
34884     return Res;
34885   }
34886
34887   return SDValue();
34888 }
34889
34890 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34891                                const X86Subtarget &Subtarget) {
34892   SDValue Op0 = N->getOperand(0);
34893   EVT VT = N->getValueType(0);
34894   EVT InVT = Op0.getValueType();
34895   EVT InSVT = InVT.getScalarType();
34896   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34897
34898   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34899   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34900   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34901     SDLoc dl(N);
34902     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34903                                  InVT.getVectorNumElements());
34904     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34905
34906     if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34907       return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34908
34909     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34910   }
34911
34912   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34913   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34914   // the optimization here.
34915   if (DAG.SignBitIsZero(Op0))
34916     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34917
34918   return SDValue();
34919 }
34920
34921 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34922                                const X86Subtarget &Subtarget) {
34923   // First try to optimize away the conversion entirely when it's
34924   // conditionally from a constant. Vectors only.
34925   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34926     return Res;
34927
34928   // Now move on to more general possibilities.
34929   SDValue Op0 = N->getOperand(0);
34930   EVT VT = N->getValueType(0);
34931   EVT InVT = Op0.getValueType();
34932   EVT InSVT = InVT.getScalarType();
34933
34934   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34935   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34936   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34937   if (InVT.isVector() &&
34938       (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34939        (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34940     SDLoc dl(N);
34941     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34942                                  InVT.getVectorNumElements());
34943     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34944     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34945   }
34946
34947   // Without AVX512DQ we only support i64 to float scalar conversion. For both
34948   // vectors and scalars, see if we know that the upper bits are all the sign
34949   // bit, in which case we can truncate the input to i32 and convert from that.
34950   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34951     unsigned BitWidth = InVT.getScalarSizeInBits();
34952     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
34953     if (NumSignBits >= (BitWidth - 31)) {
34954       EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34955       if (InVT.isVector())
34956         TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34957                                    InVT.getVectorNumElements());
34958       SDLoc dl(N);
34959       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34960       return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34961     }
34962   }
34963
34964   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34965   // a 32-bit target where SSE doesn't support i64->FP operations.
34966   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34967     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34968     EVT LdVT = Ld->getValueType(0);
34969
34970     // This transformation is not supported if the result type is f16 or f128.
34971     if (VT == MVT::f16 || VT == MVT::f128)
34972       return SDValue();
34973
34974     if (!Ld->isVolatile() && !VT.isVector() &&
34975         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34976         !Subtarget.is64Bit() && LdVT == MVT::i64) {
34977       SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34978           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34979       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34980       return FILDChain;
34981     }
34982   }
34983   return SDValue();
34984 }
34985
34986 // Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
34987 static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
34988                              X86TargetLowering::DAGCombinerInfo &DCI) {
34989   // When legalizing carry, we create carries via add X, -1
34990   // If that comes from an actual carry, via setcc, we use the
34991   // carry directly.
34992   if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
34993     SDValue Carry = N->getOperand(0);
34994     while (Carry.getOpcode() == ISD::TRUNCATE ||
34995            Carry.getOpcode() == ISD::ZERO_EXTEND ||
34996            Carry.getOpcode() == ISD::SIGN_EXTEND ||
34997            Carry.getOpcode() == ISD::ANY_EXTEND ||
34998            (Carry.getOpcode() == ISD::AND &&
34999             isOneConstant(Carry.getOperand(1))))
35000       Carry = Carry.getOperand(0);
35001
35002     if (Carry.getOpcode() == X86ISD::SETCC ||
35003         Carry.getOpcode() == X86ISD::SETCC_CARRY) {
35004       if (Carry.getConstantOperandVal(0) == X86::COND_B)
35005         return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
35006     }
35007   }
35008
35009   return SDValue();
35010 }
35011
35012 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
35013 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
35014                           X86TargetLowering::DAGCombinerInfo &DCI) {
35015   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
35016   // the result is either zero or one (depending on the input carry bit).
35017   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
35018   if (X86::isZeroNode(N->getOperand(0)) &&
35019       X86::isZeroNode(N->getOperand(1)) &&
35020       // We don't have a good way to replace an EFLAGS use, so only do this when
35021       // dead right now.
35022       SDValue(N, 1).use_empty()) {
35023     SDLoc DL(N);
35024     EVT VT = N->getValueType(0);
35025     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
35026     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
35027                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35028                                            DAG.getConstant(X86::COND_B, DL,
35029                                                            MVT::i8),
35030                                            N->getOperand(2)),
35031                                DAG.getConstant(1, DL, VT));
35032     return DCI.CombineTo(N, Res1, CarryOut);
35033   }
35034
35035   return SDValue();
35036 }
35037
35038 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
35039 /// which is more useful than 0/1 in some cases.
35040 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
35041   SDLoc DL(N);
35042   // "Condition code B" is also known as "the carry flag" (CF).
35043   SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
35044   SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
35045   MVT VT = N->getSimpleValueType(0);
35046   if (VT == MVT::i8)
35047     return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
35048
35049   assert(VT == MVT::i1 && "Unexpected type for SETCC node");
35050   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
35051 }
35052
35053 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
35054 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
35055 /// with CMP+{ADC, SBB}.
35056 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
35057   bool IsSub = N->getOpcode() == ISD::SUB;
35058   SDValue X = N->getOperand(0);
35059   SDValue Y = N->getOperand(1);
35060
35061   // If this is an add, canonicalize a zext operand to the RHS.
35062   // TODO: Incomplete? What if both sides are zexts?
35063   if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
35064       Y.getOpcode() != ISD::ZERO_EXTEND)
35065     std::swap(X, Y);
35066
35067   // Look through a one-use zext.
35068   bool PeekedThroughZext = false;
35069   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
35070     Y = Y.getOperand(0);
35071     PeekedThroughZext = true;
35072   }
35073
35074   // If this is an add, canonicalize a setcc operand to the RHS.
35075   // TODO: Incomplete? What if both sides are setcc?
35076   // TODO: Should we allow peeking through a zext of the other operand?
35077   if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
35078       Y.getOpcode() != X86ISD::SETCC)
35079     std::swap(X, Y);
35080
35081   if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
35082     return SDValue();
35083
35084   SDLoc DL(N);
35085   EVT VT = N->getValueType(0);
35086   X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
35087
35088   // If X is -1 or 0, then we have an opportunity to avoid constants required in
35089   // the general case below.
35090   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
35091   if (ConstantX) {
35092     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
35093         (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
35094       // This is a complicated way to get -1 or 0 from the carry flag:
35095       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35096       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35097       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35098                          DAG.getConstant(X86::COND_B, DL, MVT::i8),
35099                          Y.getOperand(1));
35100     }
35101
35102     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
35103         (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
35104       SDValue EFLAGS = Y->getOperand(1);
35105       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35106           EFLAGS.getValueType().isInteger() &&
35107           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35108         // Swap the operands of a SUB, and we have the same pattern as above.
35109         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
35110         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
35111         SDValue NewSub = DAG.getNode(
35112             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
35113             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35114         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35115         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35116                            DAG.getConstant(X86::COND_B, DL, MVT::i8),
35117                            NewEFLAGS);
35118       }
35119     }
35120   }
35121
35122   if (CC == X86::COND_B) {
35123     // X + SETB Z --> X + (mask SBB Z, Z)
35124     // X - SETB Z --> X - (mask SBB Z, Z)
35125     // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
35126     SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
35127     if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35128       SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35129     return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35130   }
35131
35132   if (CC == X86::COND_A) {
35133     SDValue EFLAGS = Y->getOperand(1);
35134     // Try to convert COND_A into COND_B in an attempt to facilitate
35135     // materializing "setb reg".
35136     //
35137     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
35138     // cannot take an immediate as its first operand.
35139     //
35140     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35141         EFLAGS.getValueType().isInteger() &&
35142         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35143       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
35144                                    EFLAGS.getNode()->getVTList(),
35145                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35146       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35147       SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
35148       if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35149         SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35150       return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35151     }
35152   }
35153
35154   if (CC != X86::COND_E && CC != X86::COND_NE)
35155     return SDValue();
35156
35157   SDValue Cmp = Y.getOperand(1);
35158   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
35159       !X86::isZeroNode(Cmp.getOperand(1)) ||
35160       !Cmp.getOperand(0).getValueType().isInteger())
35161     return SDValue();
35162
35163   SDValue Z = Cmp.getOperand(0);
35164   EVT ZVT = Z.getValueType();
35165
35166   // If X is -1 or 0, then we have an opportunity to avoid constants required in
35167   // the general case below.
35168   if (ConstantX) {
35169     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
35170     // fake operands:
35171     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
35172     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
35173     if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
35174         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
35175       SDValue Zero = DAG.getConstant(0, DL, ZVT);
35176       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
35177       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
35178       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35179                          DAG.getConstant(X86::COND_B, DL, MVT::i8),
35180                          SDValue(Neg.getNode(), 1));
35181     }
35182
35183     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
35184     // with fake operands:
35185     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
35186     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
35187     if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
35188         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
35189       SDValue One = DAG.getConstant(1, DL, ZVT);
35190       SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35191       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35192                          DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
35193     }
35194   }
35195
35196   // (cmp Z, 1) sets the carry flag if Z is 0.
35197   SDValue One = DAG.getConstant(1, DL, ZVT);
35198   SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35199
35200   // Add the flags type for ADC/SBB nodes.
35201   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35202
35203   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
35204   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
35205   if (CC == X86::COND_NE)
35206     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
35207                        DAG.getConstant(-1ULL, DL, VT), Cmp1);
35208
35209   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
35210   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
35211   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
35212                      DAG.getConstant(0, DL, VT), Cmp1);
35213 }
35214
35215 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
35216                                       const X86Subtarget &Subtarget) {
35217   SDValue MulOp = N->getOperand(0);
35218   SDValue Phi = N->getOperand(1);
35219
35220   if (MulOp.getOpcode() != ISD::MUL)
35221     std::swap(MulOp, Phi);
35222   if (MulOp.getOpcode() != ISD::MUL)
35223     return SDValue();
35224
35225   ShrinkMode Mode;
35226   if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
35227     return SDValue();
35228
35229   EVT VT = N->getValueType(0);
35230
35231   unsigned RegSize = 128;
35232   if (Subtarget.hasBWI())
35233     RegSize = 512;
35234   else if (Subtarget.hasAVX2())
35235     RegSize = 256;
35236   unsigned VectorSize = VT.getVectorNumElements() * 16;
35237   // If the vector size is less than 128, or greater than the supported RegSize,
35238   // do not use PMADD.
35239   if (VectorSize < 128 || VectorSize > RegSize)
35240     return SDValue();
35241
35242   SDLoc DL(N);
35243   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35244                                    VT.getVectorNumElements());
35245   EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35246                                 VT.getVectorNumElements() / 2);
35247
35248   // Shrink the operands of mul.
35249   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
35250   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
35251
35252   // Madd vector size is half of the original vector size
35253   SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
35254   // Fill the rest of the output with 0
35255   SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
35256   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
35257   return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
35258 }
35259
35260 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
35261                                      const X86Subtarget &Subtarget) {
35262   SDLoc DL(N);
35263   EVT VT = N->getValueType(0);
35264   SDValue Op0 = N->getOperand(0);
35265   SDValue Op1 = N->getOperand(1);
35266
35267   // TODO: There's nothing special about i32, any integer type above i16 should
35268   // work just as well.
35269   if (!VT.isVector() || !VT.isSimple() ||
35270       !(VT.getVectorElementType() == MVT::i32))
35271     return SDValue();
35272
35273   unsigned RegSize = 128;
35274   if (Subtarget.hasBWI())
35275     RegSize = 512;
35276   else if (Subtarget.hasAVX2())
35277     RegSize = 256;
35278
35279   // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
35280   // TODO: We should be able to handle larger vectors by splitting them before
35281   // feeding them into several SADs, and then reducing over those.
35282   if (VT.getSizeInBits() / 4 > RegSize)
35283     return SDValue();
35284
35285   // We know N is a reduction add, which means one of its operands is a phi.
35286   // To match SAD, we need the other operand to be a vector select.
35287   SDValue SelectOp, Phi;
35288   if (Op0.getOpcode() == ISD::VSELECT) {
35289     SelectOp = Op0;
35290     Phi = Op1;
35291   } else if (Op1.getOpcode() == ISD::VSELECT) {
35292     SelectOp = Op1;
35293     Phi = Op0;
35294   } else
35295     return SDValue();
35296
35297   // Check whether we have an abs-diff pattern feeding into the select.
35298   if(!detectZextAbsDiff(SelectOp, Op0, Op1))
35299     return SDValue();
35300
35301   // SAD pattern detected. Now build a SAD instruction and an addition for
35302   // reduction. Note that the number of elements of the result of SAD is less
35303   // than the number of elements of its input. Therefore, we could only update
35304   // part of elements in the reduction vector.
35305   SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
35306
35307   // The output of PSADBW is a vector of i64.
35308   // We need to turn the vector of i64 into a vector of i32.
35309   // If the reduction vector is at least as wide as the psadbw result, just
35310   // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
35311   // anyway.
35312   MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
35313   if (VT.getSizeInBits() >= ResVT.getSizeInBits())
35314     Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
35315   else
35316     Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
35317
35318   if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
35319     // Update part of elements of the reduction vector. This is done by first
35320     // extracting a sub-vector from it, updating this sub-vector, and inserting
35321     // it back.
35322     SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
35323                                  DAG.getIntPtrConstant(0, DL));
35324     SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
35325     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
35326                        DAG.getIntPtrConstant(0, DL));
35327   } else
35328     return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
35329 }
35330
35331 /// Convert vector increment or decrement to sub/add with an all-ones constant:
35332 /// add X, <1, 1...> --> sub X, <-1, -1...>
35333 /// sub X, <1, 1...> --> add X, <-1, -1...>
35334 /// The all-ones vector constant can be materialized using a pcmpeq instruction
35335 /// that is commonly recognized as an idiom (has no register dependency), so
35336 /// that's better/smaller than loading a splat 1 constant.
35337 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35338   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35339          "Unexpected opcode for increment/decrement transform");
35340
35341   // Pseudo-legality check: getOnesVector() expects one of these types, so bail
35342   // out and wait for legalization if we have an unsupported vector length.
35343   EVT VT = N->getValueType(0);
35344   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35345     return SDValue();
35346
35347   SDNode *N1 = N->getOperand(1).getNode();
35348   APInt SplatVal;
35349   if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
35350     return SDValue();
35351
35352   SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35353   unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35354   return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
35355 }
35356
35357 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
35358                           const X86Subtarget &Subtarget) {
35359   const SDNodeFlags Flags = N->getFlags();
35360   if (Flags.hasVectorReduction()) {
35361     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
35362       return Sad;
35363     if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
35364       return MAdd;
35365   }
35366   EVT VT = N->getValueType(0);
35367   SDValue Op0 = N->getOperand(0);
35368   SDValue Op1 = N->getOperand(1);
35369
35370   // Try to synthesize horizontal adds from adds of shuffles.
35371   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35372        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35373       isHorizontalBinOp(Op0, Op1, true))
35374     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
35375
35376   if (SDValue V = combineIncDecVector(N, DAG))
35377     return V;
35378
35379   return combineAddOrSubToADCOrSBB(N, DAG);
35380 }
35381
35382 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
35383                           const X86Subtarget &Subtarget) {
35384   SDValue Op0 = N->getOperand(0);
35385   SDValue Op1 = N->getOperand(1);
35386
35387   // X86 can't encode an immediate LHS of a sub. See if we can push the
35388   // negation into a preceding instruction.
35389   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
35390     // If the RHS of the sub is a XOR with one use and a constant, invert the
35391     // immediate. Then add one to the LHS of the sub so we can turn
35392     // X-Y -> X+~Y+1, saving one register.
35393     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
35394         isa<ConstantSDNode>(Op1.getOperand(1))) {
35395       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
35396       EVT VT = Op0.getValueType();
35397       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
35398                                    Op1.getOperand(0),
35399                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
35400       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
35401                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
35402     }
35403   }
35404
35405   // Try to synthesize horizontal subs from subs of shuffles.
35406   EVT VT = N->getValueType(0);
35407   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35408        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35409       isHorizontalBinOp(Op0, Op1, false))
35410     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
35411
35412   if (SDValue V = combineIncDecVector(N, DAG))
35413     return V;
35414
35415   return combineAddOrSubToADCOrSBB(N, DAG);
35416 }
35417
35418 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
35419                              TargetLowering::DAGCombinerInfo &DCI,
35420                              const X86Subtarget &Subtarget) {
35421   if (DCI.isBeforeLegalize())
35422     return SDValue();
35423
35424   SDLoc DL(N);
35425   unsigned Opcode = N->getOpcode();
35426   MVT VT = N->getSimpleValueType(0);
35427   MVT SVT = VT.getVectorElementType();
35428   unsigned NumElts = VT.getVectorNumElements();
35429   unsigned EltSizeInBits = SVT.getSizeInBits();
35430
35431   SDValue Op = N->getOperand(0);
35432   MVT OpVT = Op.getSimpleValueType();
35433   MVT OpEltVT = OpVT.getVectorElementType();
35434   unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
35435   unsigned InputBits = OpEltSizeInBits * NumElts;
35436
35437   // Perform any constant folding.
35438   // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
35439   APInt UndefElts;
35440   SmallVector<APInt, 64> EltBits;
35441   if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
35442     APInt Undefs(NumElts, 0);
35443     SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
35444     bool IsZEXT =
35445         (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
35446     for (unsigned i = 0; i != NumElts; ++i) {
35447       if (UndefElts[i]) {
35448         Undefs.setBit(i);
35449         continue;
35450       }
35451       Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
35452                        : EltBits[i].sextOrTrunc(EltSizeInBits);
35453     }
35454     return getConstVector(Vals, Undefs, VT, DAG, DL);
35455   }
35456
35457   // (vzext (bitcast (vzext (x)) -> (vzext x)
35458   // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
35459   SDValue V = peekThroughBitcasts(Op);
35460   if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
35461     MVT InnerVT = V.getSimpleValueType();
35462     MVT InnerEltVT = InnerVT.getVectorElementType();
35463
35464     // If the element sizes match exactly, we can just do one larger vzext. This
35465     // is always an exact type match as vzext operates on integer types.
35466     if (OpEltVT == InnerEltVT) {
35467       assert(OpVT == InnerVT && "Types must match for vzext!");
35468       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
35469     }
35470
35471     // The only other way we can combine them is if only a single element of the
35472     // inner vzext is used in the input to the outer vzext.
35473     if (InnerEltVT.getSizeInBits() < InputBits)
35474       return SDValue();
35475
35476     // In this case, the inner vzext is completely dead because we're going to
35477     // only look at bits inside of the low element. Just do the outer vzext on
35478     // a bitcast of the input to the inner.
35479     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
35480   }
35481
35482   // Check if we can bypass extracting and re-inserting an element of an input
35483   // vector. Essentially:
35484   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
35485   // TODO: Add X86ISD::VSEXT support
35486   if (Opcode == X86ISD::VZEXT &&
35487       V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35488       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35489       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
35490     SDValue ExtractedV = V.getOperand(0);
35491     SDValue OrigV = ExtractedV.getOperand(0);
35492     if (isNullConstant(ExtractedV.getOperand(1))) {
35493         MVT OrigVT = OrigV.getSimpleValueType();
35494         // Extract a subvector if necessary...
35495         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
35496           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
35497           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
35498                                     OrigVT.getVectorNumElements() / Ratio);
35499           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
35500                               DAG.getIntPtrConstant(0, DL));
35501         }
35502         Op = DAG.getBitcast(OpVT, OrigV);
35503         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
35504       }
35505   }
35506
35507   return SDValue();
35508 }
35509
35510 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
35511 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
35512                                   const X86Subtarget &Subtarget) {
35513   SDValue Chain = N->getOperand(0);
35514   SDValue LHS = N->getOperand(1);
35515   SDValue RHS = N->getOperand(2);
35516   MVT VT = RHS.getSimpleValueType();
35517   SDLoc DL(N);
35518
35519   auto *C = dyn_cast<ConstantSDNode>(RHS);
35520   if (!C || C->getZExtValue() != 1)
35521     return SDValue();
35522
35523   RHS = DAG.getConstant(-1, DL, VT);
35524   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
35525   return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
35526                                  DAG.getVTList(MVT::i32, MVT::Other),
35527                                  {Chain, LHS, RHS}, VT, MMO);
35528 }
35529
35530 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
35531 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
35532   SDValue Op0 = N->getOperand(0);
35533   SDValue Op1 = N->getOperand(1);
35534
35535   if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
35536     return SDValue();
35537
35538   EVT VT = N->getValueType(0);
35539   SDLoc DL(N);
35540
35541   return DAG.getNode(X86ISD::TESTM, DL, VT,
35542                      Op0->getOperand(0), Op0->getOperand(1));
35543 }
35544
35545 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
35546                                     const X86Subtarget &Subtarget) {
35547   MVT VT = N->getSimpleValueType(0);
35548   SDLoc DL(N);
35549
35550   if (N->getOperand(0) == N->getOperand(1)) {
35551     if (N->getOpcode() == X86ISD::PCMPEQ)
35552       return getOnesVector(VT, DAG, DL);
35553     if (N->getOpcode() == X86ISD::PCMPGT)
35554       return getZeroVector(VT, Subtarget, DAG, DL);
35555   }
35556
35557   return SDValue();
35558 }
35559
35560 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
35561                                       TargetLowering::DAGCombinerInfo &DCI,
35562                                       const X86Subtarget &Subtarget) {
35563   if (DCI.isBeforeLegalizeOps())
35564     return SDValue();
35565
35566   SDLoc dl(N);
35567   SDValue Vec = N->getOperand(0);
35568   SDValue SubVec = N->getOperand(1);
35569   SDValue Idx = N->getOperand(2);
35570
35571   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
35572   MVT OpVT = N->getSimpleValueType(0);
35573   MVT SubVecVT = SubVec.getSimpleValueType();
35574
35575   // If this is an insert of an extract, combine to a shuffle. Don't do this
35576   // if the insert or extract can be represented with a subvector operation.
35577   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35578       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
35579       (IdxVal != 0 || !Vec.isUndef())) {
35580     int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
35581     if (ExtIdxVal != 0) {
35582       int VecNumElts = OpVT.getVectorNumElements();
35583       int SubVecNumElts = SubVecVT.getVectorNumElements();
35584       SmallVector<int, 64> Mask(VecNumElts);
35585       // First create an identity shuffle mask.
35586       for (int i = 0; i != VecNumElts; ++i)
35587         Mask[i] = i;
35588       // Now insert the extracted portion.
35589       for (int i = 0; i != SubVecNumElts; ++i)
35590         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
35591
35592       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
35593     }
35594   }
35595
35596   // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
35597   // load:
35598   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35599   //                   (load16 addr + 16), Elts/2)
35600   // --> load32 addr
35601   // or:
35602   // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35603   //                   (load32 addr + 32), Elts/2)
35604   // --> load64 addr
35605   // or a 16-byte or 32-byte broadcast:
35606   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35607   //                   (load16 addr), Elts/2)
35608   // --> X86SubVBroadcast(load16 addr)
35609   // or:
35610   // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35611   //                   (load32 addr), Elts/2)
35612   // --> X86SubVBroadcast(load32 addr)
35613   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
35614       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35615       OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
35616     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
35617     if (Idx2 && Idx2->getZExtValue() == 0) {
35618       SDValue SubVec2 = Vec.getOperand(1);
35619       // If needed, look through bitcasts to get to the load.
35620       if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35621         bool Fast;
35622         unsigned Alignment = FirstLd->getAlignment();
35623         unsigned AS = FirstLd->getAddressSpace();
35624         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35625         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35626                                     OpVT, AS, Alignment, &Fast) && Fast) {
35627           SDValue Ops[] = {SubVec2, SubVec};
35628           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
35629                                                     Subtarget, false))
35630             return Ld;
35631         }
35632       }
35633       // If lower/upper loads are the same and the only users of the load, then
35634       // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35635       if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35636         if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35637             SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35638           return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35639         }
35640       }
35641       // If this is subv_broadcast insert into both halves, use a larger
35642       // subv_broadcast.
35643       if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35644         return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35645                            SubVec.getOperand(0));
35646       }
35647     }
35648   }
35649
35650   return SDValue();
35651 }
35652
35653
35654 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35655                                              DAGCombinerInfo &DCI) const {
35656   SelectionDAG &DAG = DCI.DAG;
35657   switch (N->getOpcode()) {
35658   default: break;
35659   case ISD::EXTRACT_VECTOR_ELT:
35660     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35661   case X86ISD::PEXTRW:
35662   case X86ISD::PEXTRB:
35663     return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35664   case ISD::INSERT_SUBVECTOR:
35665     return combineInsertSubvector(N, DAG, DCI, Subtarget);
35666   case ISD::VSELECT:
35667   case ISD::SELECT:
35668   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35669   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
35670   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
35671   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
35672   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
35673   case X86ISD::ADD:         return combineX86ADD(N, DAG, DCI);
35674   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
35675   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
35676   case ISD::SHL:
35677   case ISD::SRA:
35678   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
35679   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
35680   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
35681   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
35682   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
35683   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
35684   case ISD::STORE:          return combineStore(N, DAG, Subtarget);
35685   case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
35686   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
35687   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
35688   case ISD::FADD:
35689   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
35690   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
35691   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
35692   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
35693   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
35694   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
35695   case X86ISD::FXOR:
35696   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
35697   case X86ISD::FMIN:
35698   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
35699   case ISD::FMINNUM:
35700   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
35701   case X86ISD::BT:          return combineBT(N, DAG, DCI);
35702   case ISD::ANY_EXTEND:
35703   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
35704   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
35705   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35706   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
35707   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
35708   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
35709   case X86ISD::VSHLI:
35710   case X86ISD::VSRAI:
35711   case X86ISD::VSRLI:
35712     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35713   case ISD::SIGN_EXTEND_VECTOR_INREG:
35714   case ISD::ZERO_EXTEND_VECTOR_INREG:
35715   case X86ISD::VSEXT:
35716   case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
35717   case X86ISD::PINSRB:
35718   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
35719   case X86ISD::SHUFP:       // Handle all target specific shuffles
35720   case X86ISD::INSERTPS:
35721   case X86ISD::EXTRQI:
35722   case X86ISD::INSERTQI:
35723   case X86ISD::PALIGNR:
35724   case X86ISD::VSHLDQ:
35725   case X86ISD::VSRLDQ:
35726   case X86ISD::BLENDI:
35727   case X86ISD::UNPCKH:
35728   case X86ISD::UNPCKL:
35729   case X86ISD::MOVHLPS:
35730   case X86ISD::MOVLHPS:
35731   case X86ISD::PSHUFB:
35732   case X86ISD::PSHUFD:
35733   case X86ISD::PSHUFHW:
35734   case X86ISD::PSHUFLW:
35735   case X86ISD::MOVSHDUP:
35736   case X86ISD::MOVSLDUP:
35737   case X86ISD::MOVDDUP:
35738   case X86ISD::MOVSS:
35739   case X86ISD::MOVSD:
35740   case X86ISD::VPPERM:
35741   case X86ISD::VPERMI:
35742   case X86ISD::VPERMV:
35743   case X86ISD::VPERMV3:
35744   case X86ISD::VPERMIV3:
35745   case X86ISD::VPERMIL2:
35746   case X86ISD::VPERMILPI:
35747   case X86ISD::VPERMILPV:
35748   case X86ISD::VPERM2X128:
35749   case X86ISD::VZEXT_MOVL:
35750   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35751   case X86ISD::FMADD:
35752   case X86ISD::FMADD_RND:
35753   case X86ISD::FMADDS1_RND:
35754   case X86ISD::FMADDS3_RND:
35755   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
35756   case ISD::MGATHER:
35757   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
35758   case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
35759   case X86ISD::TESTM:       return combineTestM(N, DAG);
35760   case X86ISD::PCMPEQ:
35761   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
35762   }
35763
35764   return SDValue();
35765 }
35766
35767 /// Return true if the target has native support for the specified value type
35768 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35769 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35770 /// some i16 instructions are slow.
35771 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35772   if (!isTypeLegal(VT))
35773     return false;
35774   if (VT != MVT::i16)
35775     return true;
35776
35777   switch (Opc) {
35778   default:
35779     return true;
35780   case ISD::LOAD:
35781   case ISD::SIGN_EXTEND:
35782   case ISD::ZERO_EXTEND:
35783   case ISD::ANY_EXTEND:
35784   case ISD::SHL:
35785   case ISD::SRL:
35786   case ISD::SUB:
35787   case ISD::ADD:
35788   case ISD::MUL:
35789   case ISD::AND:
35790   case ISD::OR:
35791   case ISD::XOR:
35792     return false;
35793   }
35794 }
35795
35796 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35797 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35798 /// we don't adjust the stack we clobber the first frame index.
35799 /// See X86InstrInfo::copyPhysReg.
35800 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
35801   const MachineRegisterInfo &MRI = MF.getRegInfo();
35802   return any_of(MRI.reg_instructions(X86::EFLAGS),
35803                 [](const MachineInstr &RI) { return RI.isCopy(); });
35804 }
35805
35806 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
35807   if (hasCopyImplyingStackAdjustment(MF)) {
35808     MachineFrameInfo &MFI = MF.getFrameInfo();
35809     MFI.setHasCopyImplyingStackAdjustment(true);
35810   }
35811
35812   TargetLoweringBase::finalizeLowering(MF);
35813 }
35814
35815 /// This method query the target whether it is beneficial for dag combiner to
35816 /// promote the specified node. If true, it should return the desired promotion
35817 /// type by reference.
35818 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35819   EVT VT = Op.getValueType();
35820   if (VT != MVT::i16)
35821     return false;
35822
35823   bool Promote = false;
35824   bool Commute = false;
35825   switch (Op.getOpcode()) {
35826   default: break;
35827   case ISD::SIGN_EXTEND:
35828   case ISD::ZERO_EXTEND:
35829   case ISD::ANY_EXTEND:
35830     Promote = true;
35831     break;
35832   case ISD::SHL:
35833   case ISD::SRL: {
35834     SDValue N0 = Op.getOperand(0);
35835     // Look out for (store (shl (load), x)).
35836     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35837       return false;
35838     Promote = true;
35839     break;
35840   }
35841   case ISD::ADD:
35842   case ISD::MUL:
35843   case ISD::AND:
35844   case ISD::OR:
35845   case ISD::XOR:
35846     Commute = true;
35847     LLVM_FALLTHROUGH;
35848   case ISD::SUB: {
35849     SDValue N0 = Op.getOperand(0);
35850     SDValue N1 = Op.getOperand(1);
35851     if (!Commute && MayFoldLoad(N1))
35852       return false;
35853     // Avoid disabling potential load folding opportunities.
35854     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35855       return false;
35856     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35857       return false;
35858     Promote = true;
35859   }
35860   }
35861
35862   PVT = MVT::i32;
35863   return Promote;
35864 }
35865
35866 //===----------------------------------------------------------------------===//
35867 //                           X86 Inline Assembly Support
35868 //===----------------------------------------------------------------------===//
35869
35870 // Helper to match a string separated by whitespace.
35871 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35872   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35873
35874   for (StringRef Piece : Pieces) {
35875     if (!S.startswith(Piece)) // Check if the piece matches.
35876       return false;
35877
35878     S = S.substr(Piece.size());
35879     StringRef::size_type Pos = S.find_first_not_of(" \t");
35880     if (Pos == 0) // We matched a prefix.
35881       return false;
35882
35883     S = S.substr(Pos);
35884   }
35885
35886   return S.empty();
35887 }
35888
35889 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35890
35891   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35892     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35893         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35894         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35895
35896       if (AsmPieces.size() == 3)
35897         return true;
35898       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35899         return true;
35900     }
35901   }
35902   return false;
35903 }
35904
35905 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35906   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35907
35908   const std::string &AsmStr = IA->getAsmString();
35909
35910   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35911   if (!Ty || Ty->getBitWidth() % 16 != 0)
35912     return false;
35913
35914   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35915   SmallVector<StringRef, 4> AsmPieces;
35916   SplitString(AsmStr, AsmPieces, ";\n");
35917
35918   switch (AsmPieces.size()) {
35919   default: return false;
35920   case 1:
35921     // FIXME: this should verify that we are targeting a 486 or better.  If not,
35922     // we will turn this bswap into something that will be lowered to logical
35923     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
35924     // lower so don't worry about this.
35925     // bswap $0
35926     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35927         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35928         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35929         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35930         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35931         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35932       // No need to check constraints, nothing other than the equivalent of
35933       // "=r,0" would be valid here.
35934       return IntrinsicLowering::LowerToByteSwap(CI);
35935     }
35936
35937     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
35938     if (CI->getType()->isIntegerTy(16) &&
35939         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35940         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35941          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
35942       AsmPieces.clear();
35943       StringRef ConstraintsStr = IA->getConstraintString();
35944       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35945       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35946       if (clobbersFlagRegisters(AsmPieces))
35947         return IntrinsicLowering::LowerToByteSwap(CI);
35948     }
35949     break;
35950   case 3:
35951     if (CI->getType()->isIntegerTy(32) &&
35952         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35953         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
35954         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
35955         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
35956       AsmPieces.clear();
35957       StringRef ConstraintsStr = IA->getConstraintString();
35958       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35959       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35960       if (clobbersFlagRegisters(AsmPieces))
35961         return IntrinsicLowering::LowerToByteSwap(CI);
35962     }
35963
35964     if (CI->getType()->isIntegerTy(64)) {
35965       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
35966       if (Constraints.size() >= 2 &&
35967           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
35968           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
35969         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
35970         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
35971             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
35972             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
35973           return IntrinsicLowering::LowerToByteSwap(CI);
35974       }
35975     }
35976     break;
35977   }
35978   return false;
35979 }
35980
35981 /// Given a constraint letter, return the type of constraint for this target.
35982 X86TargetLowering::ConstraintType
35983 X86TargetLowering::getConstraintType(StringRef Constraint) const {
35984   if (Constraint.size() == 1) {
35985     switch (Constraint[0]) {
35986     case 'R':
35987     case 'q':
35988     case 'Q':
35989     case 'f':
35990     case 't':
35991     case 'u':
35992     case 'y':
35993     case 'x':
35994     case 'v':
35995     case 'Y':
35996     case 'l':
35997       return C_RegisterClass;
35998     case 'k': // AVX512 masking registers.
35999     case 'a':
36000     case 'b':
36001     case 'c':
36002     case 'd':
36003     case 'S':
36004     case 'D':
36005     case 'A':
36006       return C_Register;
36007     case 'I':
36008     case 'J':
36009     case 'K':
36010     case 'L':
36011     case 'M':
36012     case 'N':
36013     case 'G':
36014     case 'C':
36015     case 'e':
36016     case 'Z':
36017       return C_Other;
36018     default:
36019       break;
36020     }
36021   }
36022   else if (Constraint.size() == 2) {
36023     switch (Constraint[0]) {
36024     default:
36025       break;
36026     case 'Y':
36027       switch (Constraint[1]) {
36028       default:
36029         break;
36030       case 'k':
36031         return C_Register;
36032       }
36033     }
36034   }
36035   return TargetLowering::getConstraintType(Constraint);
36036 }
36037
36038 /// Examine constraint type and operand type and determine a weight value.
36039 /// This object must already have been set up with the operand type
36040 /// and the current alternative constraint selected.
36041 TargetLowering::ConstraintWeight
36042   X86TargetLowering::getSingleConstraintMatchWeight(
36043     AsmOperandInfo &info, const char *constraint) const {
36044   ConstraintWeight weight = CW_Invalid;
36045   Value *CallOperandVal = info.CallOperandVal;
36046     // If we don't have a value, we can't do a match,
36047     // but allow it at the lowest weight.
36048   if (!CallOperandVal)
36049     return CW_Default;
36050   Type *type = CallOperandVal->getType();
36051   // Look at the constraint type.
36052   switch (*constraint) {
36053   default:
36054     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
36055     LLVM_FALLTHROUGH;
36056   case 'R':
36057   case 'q':
36058   case 'Q':
36059   case 'a':
36060   case 'b':
36061   case 'c':
36062   case 'd':
36063   case 'S':
36064   case 'D':
36065   case 'A':
36066     if (CallOperandVal->getType()->isIntegerTy())
36067       weight = CW_SpecificReg;
36068     break;
36069   case 'f':
36070   case 't':
36071   case 'u':
36072     if (type->isFloatingPointTy())
36073       weight = CW_SpecificReg;
36074     break;
36075   case 'y':
36076     if (type->isX86_MMXTy() && Subtarget.hasMMX())
36077       weight = CW_SpecificReg;
36078     break;
36079   case 'Y':
36080     // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
36081     if (constraint[1] == 'k') {
36082       // Support for 'Yk' (similarly to the 'k' variant below).
36083       weight = CW_SpecificReg;
36084       break;
36085     }
36086   // Else fall through (handle "Y" constraint).
36087     LLVM_FALLTHROUGH;
36088   case 'v':
36089     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
36090       weight = CW_Register;
36091     LLVM_FALLTHROUGH;
36092   case 'x':
36093     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
36094         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
36095       weight = CW_Register;
36096     break;
36097   case 'k':
36098     // Enable conditional vector operations using %k<#> registers.
36099     weight = CW_SpecificReg;
36100     break;
36101   case 'I':
36102     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
36103       if (C->getZExtValue() <= 31)
36104         weight = CW_Constant;
36105     }
36106     break;
36107   case 'J':
36108     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36109       if (C->getZExtValue() <= 63)
36110         weight = CW_Constant;
36111     }
36112     break;
36113   case 'K':
36114     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36115       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
36116         weight = CW_Constant;
36117     }
36118     break;
36119   case 'L':
36120     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36121       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
36122         weight = CW_Constant;
36123     }
36124     break;
36125   case 'M':
36126     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36127       if (C->getZExtValue() <= 3)
36128         weight = CW_Constant;
36129     }
36130     break;
36131   case 'N':
36132     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36133       if (C->getZExtValue() <= 0xff)
36134         weight = CW_Constant;
36135     }
36136     break;
36137   case 'G':
36138   case 'C':
36139     if (isa<ConstantFP>(CallOperandVal)) {
36140       weight = CW_Constant;
36141     }
36142     break;
36143   case 'e':
36144     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36145       if ((C->getSExtValue() >= -0x80000000LL) &&
36146           (C->getSExtValue() <= 0x7fffffffLL))
36147         weight = CW_Constant;
36148     }
36149     break;
36150   case 'Z':
36151     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36152       if (C->getZExtValue() <= 0xffffffff)
36153         weight = CW_Constant;
36154     }
36155     break;
36156   }
36157   return weight;
36158 }
36159
36160 /// Try to replace an X constraint, which matches anything, with another that
36161 /// has more specific requirements based on the type of the corresponding
36162 /// operand.
36163 const char *X86TargetLowering::
36164 LowerXConstraint(EVT ConstraintVT) const {
36165   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
36166   // 'f' like normal targets.
36167   if (ConstraintVT.isFloatingPoint()) {
36168     if (Subtarget.hasSSE2())
36169       return "Y";
36170     if (Subtarget.hasSSE1())
36171       return "x";
36172   }
36173
36174   return TargetLowering::LowerXConstraint(ConstraintVT);
36175 }
36176
36177 /// Lower the specified operand into the Ops vector.
36178 /// If it is invalid, don't add anything to Ops.
36179 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
36180                                                      std::string &Constraint,
36181                                                      std::vector<SDValue>&Ops,
36182                                                      SelectionDAG &DAG) const {
36183   SDValue Result;
36184
36185   // Only support length 1 constraints for now.
36186   if (Constraint.length() > 1) return;
36187
36188   char ConstraintLetter = Constraint[0];
36189   switch (ConstraintLetter) {
36190   default: break;
36191   case 'I':
36192     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36193       if (C->getZExtValue() <= 31) {
36194         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36195                                        Op.getValueType());
36196         break;
36197       }
36198     }
36199     return;
36200   case 'J':
36201     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36202       if (C->getZExtValue() <= 63) {
36203         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36204                                        Op.getValueType());
36205         break;
36206       }
36207     }
36208     return;
36209   case 'K':
36210     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36211       if (isInt<8>(C->getSExtValue())) {
36212         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36213                                        Op.getValueType());
36214         break;
36215       }
36216     }
36217     return;
36218   case 'L':
36219     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36220       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
36221           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
36222         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
36223                                        Op.getValueType());
36224         break;
36225       }
36226     }
36227     return;
36228   case 'M':
36229     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36230       if (C->getZExtValue() <= 3) {
36231         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36232                                        Op.getValueType());
36233         break;
36234       }
36235     }
36236     return;
36237   case 'N':
36238     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36239       if (C->getZExtValue() <= 255) {
36240         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36241                                        Op.getValueType());
36242         break;
36243       }
36244     }
36245     return;
36246   case 'O':
36247     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36248       if (C->getZExtValue() <= 127) {
36249         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36250                                        Op.getValueType());
36251         break;
36252       }
36253     }
36254     return;
36255   case 'e': {
36256     // 32-bit signed value
36257     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36258       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36259                                            C->getSExtValue())) {
36260         // Widen to 64 bits here to get it sign extended.
36261         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
36262         break;
36263       }
36264     // FIXME gcc accepts some relocatable values here too, but only in certain
36265     // memory models; it's complicated.
36266     }
36267     return;
36268   }
36269   case 'Z': {
36270     // 32-bit unsigned value
36271     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36272       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36273                                            C->getZExtValue())) {
36274         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36275                                        Op.getValueType());
36276         break;
36277       }
36278     }
36279     // FIXME gcc accepts some relocatable values here too, but only in certain
36280     // memory models; it's complicated.
36281     return;
36282   }
36283   case 'i': {
36284     // Literal immediates are always ok.
36285     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
36286       // Widen to 64 bits here to get it sign extended.
36287       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
36288       break;
36289     }
36290
36291     // In any sort of PIC mode addresses need to be computed at runtime by
36292     // adding in a register or some sort of table lookup.  These can't
36293     // be used as immediates.
36294     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
36295       return;
36296
36297     // If we are in non-pic codegen mode, we allow the address of a global (with
36298     // an optional displacement) to be used with 'i'.
36299     GlobalAddressSDNode *GA = nullptr;
36300     int64_t Offset = 0;
36301
36302     // Match either (GA), (GA+C), (GA+C1+C2), etc.
36303     while (1) {
36304       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
36305         Offset += GA->getOffset();
36306         break;
36307       } else if (Op.getOpcode() == ISD::ADD) {
36308         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36309           Offset += C->getZExtValue();
36310           Op = Op.getOperand(0);
36311           continue;
36312         }
36313       } else if (Op.getOpcode() == ISD::SUB) {
36314         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36315           Offset += -C->getZExtValue();
36316           Op = Op.getOperand(0);
36317           continue;
36318         }
36319       }
36320
36321       // Otherwise, this isn't something we can handle, reject it.
36322       return;
36323     }
36324
36325     const GlobalValue *GV = GA->getGlobal();
36326     // If we require an extra load to get this address, as in PIC mode, we
36327     // can't accept it.
36328     if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
36329       return;
36330
36331     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
36332                                         GA->getValueType(0), Offset);
36333     break;
36334   }
36335   }
36336
36337   if (Result.getNode()) {
36338     Ops.push_back(Result);
36339     return;
36340   }
36341   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
36342 }
36343
36344 /// Check if \p RC is a general purpose register class.
36345 /// I.e., GR* or one of their variant.
36346 static bool isGRClass(const TargetRegisterClass &RC) {
36347   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
36348          RC.hasSuperClassEq(&X86::GR16RegClass) ||
36349          RC.hasSuperClassEq(&X86::GR32RegClass) ||
36350          RC.hasSuperClassEq(&X86::GR64RegClass) ||
36351          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
36352 }
36353
36354 /// Check if \p RC is a vector register class.
36355 /// I.e., FR* / VR* or one of their variant.
36356 static bool isFRClass(const TargetRegisterClass &RC) {
36357   return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
36358          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
36359          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
36360          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
36361          RC.hasSuperClassEq(&X86::VR512RegClass);
36362 }
36363
36364 std::pair<unsigned, const TargetRegisterClass *>
36365 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
36366                                                 StringRef Constraint,
36367                                                 MVT VT) const {
36368   // First, see if this is a constraint that directly corresponds to an LLVM
36369   // register class.
36370   if (Constraint.size() == 1) {
36371     // GCC Constraint Letters
36372     switch (Constraint[0]) {
36373     default: break;
36374       // TODO: Slight differences here in allocation order and leaving
36375       // RIP in the class. Do they matter any more here than they do
36376       // in the normal allocation?
36377     case 'k':
36378       if (Subtarget.hasAVX512()) {
36379         //  Only supported in AVX512 or later.
36380         switch (VT.SimpleTy) {
36381         default: break;
36382         case MVT::i32:
36383           return std::make_pair(0U, &X86::VK32RegClass);
36384         case MVT::i16:
36385           return std::make_pair(0U, &X86::VK16RegClass);
36386         case MVT::i8:
36387           return std::make_pair(0U, &X86::VK8RegClass);
36388         case MVT::i1:
36389           return std::make_pair(0U, &X86::VK1RegClass);
36390         case MVT::i64:
36391           return std::make_pair(0U, &X86::VK64RegClass);
36392         }
36393       }
36394       break;
36395     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
36396       if (Subtarget.is64Bit()) {
36397         if (VT == MVT::i32 || VT == MVT::f32)
36398           return std::make_pair(0U, &X86::GR32RegClass);
36399         if (VT == MVT::i16)
36400           return std::make_pair(0U, &X86::GR16RegClass);
36401         if (VT == MVT::i8 || VT == MVT::i1)
36402           return std::make_pair(0U, &X86::GR8RegClass);
36403         if (VT == MVT::i64 || VT == MVT::f64)
36404           return std::make_pair(0U, &X86::GR64RegClass);
36405         break;
36406       }
36407       LLVM_FALLTHROUGH;
36408       // 32-bit fallthrough
36409     case 'Q':   // Q_REGS
36410       if (VT == MVT::i32 || VT == MVT::f32)
36411         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
36412       if (VT == MVT::i16)
36413         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
36414       if (VT == MVT::i8 || VT == MVT::i1)
36415         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
36416       if (VT == MVT::i64)
36417         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
36418       break;
36419     case 'r':   // GENERAL_REGS
36420     case 'l':   // INDEX_REGS
36421       if (VT == MVT::i8 || VT == MVT::i1)
36422         return std::make_pair(0U, &X86::GR8RegClass);
36423       if (VT == MVT::i16)
36424         return std::make_pair(0U, &X86::GR16RegClass);
36425       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
36426         return std::make_pair(0U, &X86::GR32RegClass);
36427       return std::make_pair(0U, &X86::GR64RegClass);
36428     case 'R':   // LEGACY_REGS
36429       if (VT == MVT::i8 || VT == MVT::i1)
36430         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
36431       if (VT == MVT::i16)
36432         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
36433       if (VT == MVT::i32 || !Subtarget.is64Bit())
36434         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
36435       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
36436     case 'f':  // FP Stack registers.
36437       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
36438       // value to the correct fpstack register class.
36439       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
36440         return std::make_pair(0U, &X86::RFP32RegClass);
36441       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
36442         return std::make_pair(0U, &X86::RFP64RegClass);
36443       return std::make_pair(0U, &X86::RFP80RegClass);
36444     case 'y':   // MMX_REGS if MMX allowed.
36445       if (!Subtarget.hasMMX()) break;
36446       return std::make_pair(0U, &X86::VR64RegClass);
36447     case 'Y':   // SSE_REGS if SSE2 allowed
36448       if (!Subtarget.hasSSE2()) break;
36449       LLVM_FALLTHROUGH;
36450     case 'v':
36451     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
36452       if (!Subtarget.hasSSE1()) break;
36453       bool VConstraint = (Constraint[0] == 'v');
36454
36455       switch (VT.SimpleTy) {
36456       default: break;
36457       // Scalar SSE types.
36458       case MVT::f32:
36459       case MVT::i32:
36460         if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
36461           return std::make_pair(0U, &X86::FR32XRegClass);
36462         return std::make_pair(0U, &X86::FR32RegClass);
36463       case MVT::f64:
36464       case MVT::i64:
36465         if (VConstraint && Subtarget.hasVLX())
36466           return std::make_pair(0U, &X86::FR64XRegClass);
36467         return std::make_pair(0U, &X86::FR64RegClass);
36468       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36469       // Vector types.
36470       case MVT::v16i8:
36471       case MVT::v8i16:
36472       case MVT::v4i32:
36473       case MVT::v2i64:
36474       case MVT::v4f32:
36475       case MVT::v2f64:
36476         if (VConstraint && Subtarget.hasVLX())
36477           return std::make_pair(0U, &X86::VR128XRegClass);
36478         return std::make_pair(0U, &X86::VR128RegClass);
36479       // AVX types.
36480       case MVT::v32i8:
36481       case MVT::v16i16:
36482       case MVT::v8i32:
36483       case MVT::v4i64:
36484       case MVT::v8f32:
36485       case MVT::v4f64:
36486         if (VConstraint && Subtarget.hasVLX())
36487           return std::make_pair(0U, &X86::VR256XRegClass);
36488         return std::make_pair(0U, &X86::VR256RegClass);
36489       case MVT::v8f64:
36490       case MVT::v16f32:
36491       case MVT::v16i32:
36492       case MVT::v8i64:
36493         return std::make_pair(0U, &X86::VR512RegClass);
36494       }
36495       break;
36496     }
36497   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
36498     switch (Constraint[1]) {
36499     default:
36500       break;
36501     case 'k':
36502       // This register class doesn't allocate k0 for masked vector operation.
36503       if (Subtarget.hasAVX512()) { // Only supported in AVX512.
36504         switch (VT.SimpleTy) {
36505         default: break;
36506         case MVT::i32:
36507           return std::make_pair(0U, &X86::VK32WMRegClass);
36508         case MVT::i16:
36509           return std::make_pair(0U, &X86::VK16WMRegClass);
36510         case MVT::i8:
36511           return std::make_pair(0U, &X86::VK8WMRegClass);
36512         case MVT::i1:
36513           return std::make_pair(0U, &X86::VK1WMRegClass);
36514         case MVT::i64:
36515           return std::make_pair(0U, &X86::VK64WMRegClass);
36516         }
36517       }
36518       break;
36519     }
36520   }
36521
36522   // Use the default implementation in TargetLowering to convert the register
36523   // constraint into a member of a register class.
36524   std::pair<unsigned, const TargetRegisterClass*> Res;
36525   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
36526
36527   // Not found as a standard register?
36528   if (!Res.second) {
36529     // Map st(0) -> st(7) -> ST0
36530     if (Constraint.size() == 7 && Constraint[0] == '{' &&
36531         tolower(Constraint[1]) == 's' &&
36532         tolower(Constraint[2]) == 't' &&
36533         Constraint[3] == '(' &&
36534         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
36535         Constraint[5] == ')' &&
36536         Constraint[6] == '}') {
36537
36538       Res.first = X86::FP0+Constraint[4]-'0';
36539       Res.second = &X86::RFP80RegClass;
36540       return Res;
36541     }
36542
36543     // GCC allows "st(0)" to be called just plain "st".
36544     if (StringRef("{st}").equals_lower(Constraint)) {
36545       Res.first = X86::FP0;
36546       Res.second = &X86::RFP80RegClass;
36547       return Res;
36548     }
36549
36550     // flags -> EFLAGS
36551     if (StringRef("{flags}").equals_lower(Constraint)) {
36552       Res.first = X86::EFLAGS;
36553       Res.second = &X86::CCRRegClass;
36554       return Res;
36555     }
36556
36557     // 'A' means [ER]AX + [ER]DX.
36558     if (Constraint == "A") {
36559       if (Subtarget.is64Bit()) {
36560         Res.first = X86::RAX;
36561         Res.second = &X86::GR64_ADRegClass;
36562       } else {
36563         assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
36564                "Expecting 64, 32 or 16 bit subtarget");
36565         Res.first = X86::EAX;
36566         Res.second = &X86::GR32_ADRegClass;
36567       }
36568       return Res;
36569     }
36570     return Res;
36571   }
36572
36573   // Otherwise, check to see if this is a register class of the wrong value
36574   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
36575   // turn into {ax},{dx}.
36576   // MVT::Other is used to specify clobber names.
36577   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
36578     return Res;   // Correct type already, nothing to do.
36579
36580   // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
36581   // return "eax". This should even work for things like getting 64bit integer
36582   // registers when given an f64 type.
36583   const TargetRegisterClass *Class = Res.second;
36584   // The generic code will match the first register class that contains the
36585   // given register. Thus, based on the ordering of the tablegened file,
36586   // the "plain" GR classes might not come first.
36587   // Therefore, use a helper method.
36588   if (isGRClass(*Class)) {
36589     unsigned Size = VT.getSizeInBits();
36590     if (Size == 1) Size = 8;
36591     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
36592     if (DestReg > 0) {
36593       Res.first = DestReg;
36594       Res.second = Size == 8 ? &X86::GR8RegClass
36595                  : Size == 16 ? &X86::GR16RegClass
36596                  : Size == 32 ? &X86::GR32RegClass
36597                  : &X86::GR64RegClass;
36598       assert(Res.second->contains(Res.first) && "Register in register class");
36599     } else {
36600       // No register found/type mismatch.
36601       Res.first = 0;
36602       Res.second = nullptr;
36603     }
36604   } else if (isFRClass(*Class)) {
36605     // Handle references to XMM physical registers that got mapped into the
36606     // wrong class.  This can happen with constraints like {xmm0} where the
36607     // target independent register mapper will just pick the first match it can
36608     // find, ignoring the required type.
36609
36610     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36611     if (VT == MVT::f32 || VT == MVT::i32)
36612       Res.second = &X86::FR32RegClass;
36613     else if (VT == MVT::f64 || VT == MVT::i64)
36614       Res.second = &X86::FR64RegClass;
36615     else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
36616       Res.second = &X86::VR128RegClass;
36617     else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
36618       Res.second = &X86::VR256RegClass;
36619     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
36620       Res.second = &X86::VR512RegClass;
36621     else {
36622       // Type mismatch and not a clobber: Return an error;
36623       Res.first = 0;
36624       Res.second = nullptr;
36625     }
36626   }
36627
36628   return Res;
36629 }
36630
36631 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
36632                                             const AddrMode &AM, Type *Ty,
36633                                             unsigned AS) const {
36634   // Scaling factors are not free at all.
36635   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
36636   // will take 2 allocations in the out of order engine instead of 1
36637   // for plain addressing mode, i.e. inst (reg1).
36638   // E.g.,
36639   // vaddps (%rsi,%drx), %ymm0, %ymm1
36640   // Requires two allocations (one for the load, one for the computation)
36641   // whereas:
36642   // vaddps (%rsi), %ymm0, %ymm1
36643   // Requires just 1 allocation, i.e., freeing allocations for other operations
36644   // and having less micro operations to execute.
36645   //
36646   // For some X86 architectures, this is even worse because for instance for
36647   // stores, the complex addressing mode forces the instruction to use the
36648   // "load" ports instead of the dedicated "store" port.
36649   // E.g., on Haswell:
36650   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
36651   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
36652   if (isLegalAddressingMode(DL, AM, Ty, AS))
36653     // Scale represents reg2 * scale, thus account for 1
36654     // as soon as we use a second register.
36655     return AM.Scale != 0;
36656   return -1;
36657 }
36658
36659 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
36660   // Integer division on x86 is expensive. However, when aggressively optimizing
36661   // for code size, we prefer to use a div instruction, as it is usually smaller
36662   // than the alternative sequence.
36663   // The exception to this is vector division. Since x86 doesn't have vector
36664   // integer division, leaving the division as-is is a loss even in terms of
36665   // size, because it will have to be scalarized, while the alternative code
36666   // sequence can be performed in vector form.
36667   bool OptSize =
36668       Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
36669   return OptSize && !VT.isVector();
36670 }
36671
36672 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
36673   if (!Subtarget.is64Bit())
36674     return;
36675
36676   // Update IsSplitCSR in X86MachineFunctionInfo.
36677   X86MachineFunctionInfo *AFI =
36678     Entry->getParent()->getInfo<X86MachineFunctionInfo>();
36679   AFI->setIsSplitCSR(true);
36680 }
36681
36682 void X86TargetLowering::insertCopiesSplitCSR(
36683     MachineBasicBlock *Entry,
36684     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
36685   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36686   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
36687   if (!IStart)
36688     return;
36689
36690   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36691   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
36692   MachineBasicBlock::iterator MBBI = Entry->begin();
36693   for (const MCPhysReg *I = IStart; *I; ++I) {
36694     const TargetRegisterClass *RC = nullptr;
36695     if (X86::GR64RegClass.contains(*I))
36696       RC = &X86::GR64RegClass;
36697     else
36698       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
36699
36700     unsigned NewVR = MRI->createVirtualRegister(RC);
36701     // Create copy from CSR to a virtual register.
36702     // FIXME: this currently does not emit CFI pseudo-instructions, it works
36703     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
36704     // nounwind. If we want to generalize this later, we may need to emit
36705     // CFI pseudo-instructions.
36706     assert(Entry->getParent()->getFunction()->hasFnAttribute(
36707                Attribute::NoUnwind) &&
36708            "Function should be nounwind in insertCopiesSplitCSR!");
36709     Entry->addLiveIn(*I);
36710     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
36711         .addReg(*I);
36712
36713     // Insert the copy-back instructions right before the terminator.
36714     for (auto *Exit : Exits)
36715       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
36716               TII->get(TargetOpcode::COPY), *I)
36717           .addReg(NewVR);
36718   }
36719 }
36720
36721 bool X86TargetLowering::supportSwiftError() const {
36722   return Subtarget.is64Bit();
36723 }
36724
36725 /// Returns the name of the symbol used to emit stack probes or the empty
36726 /// string if not applicable.
36727 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
36728   // If the function specifically requests stack probes, emit them.
36729   if (MF.getFunction()->hasFnAttribute("probe-stack"))
36730     return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
36731
36732   // Generally, if we aren't on Windows, the platform ABI does not include
36733   // support for stack probes, so don't emit them.
36734   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
36735     return "";
36736
36737   // We need a stack probe to conform to the Windows ABI. Choose the right
36738   // symbol.
36739   if (Subtarget.is64Bit())
36740     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
36741   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
36742 }