contrib/llvm/lib/Target/X86/X86ISelLowering.cpp

   1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file defines the interfaces that X86 uses to lower LLVM code into a
  11 // selection DAG.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "X86ISelLowering.h"
  16 #include "Utils/X86ShuffleDecode.h"
  17 #include "X86CallingConv.h"
  18 #include "X86FrameLowering.h"
  19 #include "X86InstrBuilder.h"
  20 #include "X86IntrinsicsInfo.h"
  21 #include "X86MachineFunctionInfo.h"
  22 #include "X86ShuffleDecodeConstantPool.h"
  23 #include "X86TargetMachine.h"
  24 #include "X86TargetObjectFile.h"
  25 #include "llvm/ADT/SmallBitVector.h"
  26 #include "llvm/ADT/SmallSet.h"
  27 #include "llvm/ADT/Statistic.h"
  28 #include "llvm/ADT/StringExtras.h"
  29 #include "llvm/ADT/StringSwitch.h"
  30 #include "llvm/Analysis/EHPersonalities.h"
  31 #include "llvm/CodeGen/IntrinsicLowering.h"
  32 #include "llvm/CodeGen/MachineFrameInfo.h"
  33 #include "llvm/CodeGen/MachineFunction.h"
  34 #include "llvm/CodeGen/MachineInstrBuilder.h"
  35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
  36 #include "llvm/CodeGen/MachineModuleInfo.h"
  37 #include "llvm/CodeGen/MachineRegisterInfo.h"
  38 #include "llvm/CodeGen/WinEHFuncInfo.h"
  39 #include "llvm/IR/CallSite.h"
  40 #include "llvm/IR/CallingConv.h"
  41 #include "llvm/IR/Constants.h"
  42 #include "llvm/IR/DerivedTypes.h"
  43 #include "llvm/IR/DiagnosticInfo.h"
  44 #include "llvm/IR/Function.h"
  45 #include "llvm/IR/GlobalAlias.h"
  46 #include "llvm/IR/GlobalVariable.h"
  47 #include "llvm/IR/Instructions.h"
  48 #include "llvm/IR/Intrinsics.h"
  49 #include "llvm/MC/MCAsmInfo.h"
  50 #include "llvm/MC/MCContext.h"
  51 #include "llvm/MC/MCExpr.h"
  52 #include "llvm/MC/MCSymbol.h"
  53 #include "llvm/Support/CommandLine.h"
  54 #include "llvm/Support/Debug.h"
  55 #include "llvm/Support/ErrorHandling.h"
  56 #include "llvm/Support/KnownBits.h"
  57 #include "llvm/Support/MathExtras.h"
  58 #include "llvm/Target/TargetLowering.h"
  59 #include "llvm/Target/TargetOptions.h"
  60 #include <algorithm>
  61 #include <bitset>
  62 #include <cctype>
  63 #include <numeric>
  64 using namespace llvm;
  65
   66 #define DEBUG_TYPE "x86-isel"
   67 // Tail-call statistic; nothing in the visible part of this file increments it.
   68 STATISTIC(NumTailCalls, "Number of tail calls");
  69
  70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
  71     "x86-experimental-vector-widening-legalization", cl::init(false),
  72     cl::desc("Enable an experimental vector type legalization through widening "
  73              "rather than promotion."),
  74     cl::Hidden);
  75
  76 static cl::opt<int> ExperimentalPrefLoopAlignment(
  77     "x86-experimental-pref-loop-alignment", cl::init(4),
  78     cl::desc("Sets the preferable loop alignment for experiments "
  79              "(the last x86-experimental-pref-loop-alignment bits"
  80              " of the loop header PC will be 0)."),
  81     cl::Hidden);
  82
  83 static cl::opt<bool> MulConstantOptimization(
  84     "mul-constant-optimization", cl::init(true),
  85     cl::desc("Replace 'mul x, Const' with more effective instructions like "
  86              "SHIFT, LEA, etc."),
  87     cl::Hidden);
  88
  89 /// Call this when the user attempts to do something unsupported, like
  90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
  91 /// report_fatal_error, so calling code should attempt to recover without
  92 /// crashing.
  93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
  94                              const char *Msg) {
  95   MachineFunction &MF = DAG.getMachineFunction();
  96   DAG.getContext()->diagnose(
  97       DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
  98 }
  99
 100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 101                                      const X86Subtarget &STI)
 102     : TargetLowering(TM), Subtarget(STI) {
 103   bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
 104   X86ScalarSSEf64 = Subtarget.hasSSE2();
 105   X86ScalarSSEf32 = Subtarget.hasSSE1();
 106   MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
 107
 108   // Set up the TargetLowering object.
 109
 110   // X86 is weird. It always uses i8 for shift amounts and setcc results.
 111   setBooleanContents(ZeroOrOneBooleanContent);
 112   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
 113   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 114
 115   // For 64-bit, since we have so many registers, use the ILP scheduler.
 116   // For 32-bit, use the register pressure specific scheduling.
 117   // For Atom, always use ILP scheduling.
 118   if (Subtarget.isAtom())
 119     setSchedulingPreference(Sched::ILP);
 120   else if (Subtarget.is64Bit())
 121     setSchedulingPreference(Sched::ILP);
 122   else
 123     setSchedulingPreference(Sched::RegPressure);
 124   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 125   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 126
 127   // Bypass expensive divides and use cheaper ones.
 128   if (TM.getOptLevel() >= CodeGenOpt::Default) {
 129     if (Subtarget.hasSlowDivide32())
 130       addBypassSlowDiv(32, 8);
 131     if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
 132       addBypassSlowDiv(64, 32);
 133   }
 134
 135   if (Subtarget.isTargetKnownWindowsMSVC() ||
 136       Subtarget.isTargetWindowsItanium()) {
 137     // Setup Windows compiler runtime calls.
 138     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
 139     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
 140     setLibcallName(RTLIB::SREM_I64, "_allrem");
 141     setLibcallName(RTLIB::UREM_I64, "_aullrem");
 142     setLibcallName(RTLIB::MUL_I64, "_allmul");
 143     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
 144     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
 145     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
 146     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
 147     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
 148   }
 149
 150   if (Subtarget.isTargetDarwin()) {
 151     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
 152     setUseUnderscoreSetJmp(false);
 153     setUseUnderscoreLongJmp(false);
 154   } else if (Subtarget.isTargetWindowsGNU()) {
 155     // MS runtime is weird: it exports _setjmp, but longjmp!
 156     setUseUnderscoreSetJmp(true);
 157     setUseUnderscoreLongJmp(false);
 158   } else {
 159     setUseUnderscoreSetJmp(true);
 160     setUseUnderscoreLongJmp(true);
 161   }
 162
 163   // Set up the register classes.
 164   addRegisterClass(MVT::i8, &X86::GR8RegClass);
 165   addRegisterClass(MVT::i16, &X86::GR16RegClass);
 166   addRegisterClass(MVT::i32, &X86::GR32RegClass);
 167   if (Subtarget.is64Bit())
 168     addRegisterClass(MVT::i64, &X86::GR64RegClass);
 169
 170   for (MVT VT : MVT::integer_valuetypes())
 171     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 172
 173   // We don't accept any truncstore of integer registers.
 174   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
 175   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
 176   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
 177   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
 178   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
 179   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
 180
 181   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 182
 183   // SETOEQ and SETUNE require checking two conditions.
 184   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
 185   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
 186   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
 187   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
 188   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
 189   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
 190
 191   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
 192   // operation.
 193   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
 194   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
 195   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
 196
 197   if (Subtarget.is64Bit()) {
 198     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
 199       // f32/f64 are legal, f80 is custom.
 200       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
 201     else
 202       setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
 203     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 204   } else if (!Subtarget.useSoftFloat()) {
 205     // We have an algorithm for SSE2->double, and we turn this into a
 206     // 64-bit FILD followed by conditional FADD for other targets.
 207     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
 208     // We have an algorithm for SSE2, and we turn this into a 64-bit
 209     // FILD or VCVTUSI2SS/SD for other targets.
 210     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
 211   }
 212
 213   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
 214   // this operation.
 215   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
 216   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
 217
 218   if (!Subtarget.useSoftFloat()) {
 219     // SSE has no i16 to fp conversion, only i32.
 220     if (X86ScalarSSEf32) {
 221       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 222       // f32 and f64 cases are Legal, f80 case is not
 223       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 224     } else {
 225       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
 226       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
 227     }
 228   } else {
 229     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
 230     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
 231   }
 232
 233   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
 234   // this operation.
 235   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
 236   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
 237
 238   if (!Subtarget.useSoftFloat()) {
 239     // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
 240     // are Legal, f80 is custom lowered.
 241     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
 242     setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
 243
 244     if (X86ScalarSSEf32) {
 245       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 246       // f32 and f64 cases are Legal, f80 case is not
 247       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 248     } else {
 249       setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
 250       setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
 251     }
 252   } else {
 253     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
 254     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Expand);
 255     setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Expand);
 256   }
 257
 258   // Handle FP_TO_UINT by promoting the destination to a larger signed
 259   // conversion.
 260   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
 261   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
 262   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
 263
 264   if (Subtarget.is64Bit()) {
 265     if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
 266       // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
 267       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 268       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Custom);
 269     } else {
 270       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
 271       setOperationAction(ISD::FP_TO_UINT   , MVT::i64  , Expand);
 272     }
 273   } else if (!Subtarget.useSoftFloat()) {
 274     // Since AVX is a superset of SSE3, only check for SSE here.
 275     if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
 276       // Expand FP_TO_UINT into a select.
 277       // FIXME: We would like to use a Custom expander here eventually to do
 278       // the optimal thing for SSE vs. the default expansion in the legalizer.
 279       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
 280     else
 281       // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
 282       // With SSE3 we can use fisttpll to convert to a signed i64; without
 283       // SSE, we're stuck with a fistpll.
 284       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
 285
 286     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
 287   }
 288
 289   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
 290   if (!X86ScalarSSEf64) {
 291     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
 292     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
 293     if (Subtarget.is64Bit()) {
 294       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
 295       // Without SSE, i64->f64 goes through memory.
 296       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
 297     }
 298   } else if (!Subtarget.is64Bit())
 299     setOperationAction(ISD::BITCAST      , MVT::i64  , Custom);
 300
 301   // Scalar integer divide and remainder are lowered to use operations that
 302   // produce two results, to match the available instructions. This exposes
 303   // the two-result form to trivial CSE, which is able to combine x/y and x%y
 304   // into a single instruction.
 305   //
 306   // Scalar integer multiply-high is also lowered to use two-result
 307   // operations, to match the available instructions. However, plain multiply
 308   // (low) operations are left as Legal, as there are single-result
 309   // instructions for this in x86. Using the two-result multiply instructions
 310   // when both high and low results are needed must be arranged by dagcombine.
 311   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 312     setOperationAction(ISD::MULHS, VT, Expand);
 313     setOperationAction(ISD::MULHU, VT, Expand);
 314     setOperationAction(ISD::SDIV, VT, Expand);
 315     setOperationAction(ISD::UDIV, VT, Expand);
 316     setOperationAction(ISD::SREM, VT, Expand);
 317     setOperationAction(ISD::UREM, VT, Expand);
 318   }
 319
 320   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
 321   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
 322   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
 323                    MVT::i8,  MVT::i16, MVT::i32, MVT::i64 }) {
 324     setOperationAction(ISD::BR_CC,     VT, Expand);
 325     setOperationAction(ISD::SELECT_CC, VT, Expand);
 326   }
 327   if (Subtarget.is64Bit())
 328     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
 329   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
 330   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
 331   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
 332   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
 333
 334   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
 335   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
 336   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
 337   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
 338
 339   // Promote the i8 variants and force them on up to i32 which has a shorter
 340   // encoding.
 341   setOperationPromotedToType(ISD::CTTZ           , MVT::i8   , MVT::i32);
 342   setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 343   if (!Subtarget.hasBMI()) {
 344     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
 345     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
 346     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Legal);
 347     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Legal);
 348     if (Subtarget.is64Bit()) {
 349       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
 350       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
 351     }
 352   }
 353
 354   if (Subtarget.hasLZCNT()) {
 355     // When promoting the i8 variants, force them to i32 for a shorter
 356     // encoding.
 357     setOperationPromotedToType(ISD::CTLZ           , MVT::i8   , MVT::i32);
 358     setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
 359   } else {
 360     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
 361     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
 362     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
 363     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
 364     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
 365     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
 366     if (Subtarget.is64Bit()) {
 367       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
 368       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
 369     }
 370   }
 371
 372   // Special handling for half-precision floating point conversions.
 373   // If we don't have F16C support, then lower half float conversions
 374   // into library calls.
 375   if (Subtarget.useSoftFloat() ||
 376       (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
 377     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
 378     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
 379   }
 380
 381   // There's never any support for operations beyond MVT::f32.
 382   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
 383   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
 384   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
 385   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
 386
 387   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
 388   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
 389   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
 390   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 391   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 392   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
 393
 394   if (Subtarget.hasPOPCNT()) {
 395     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
 396   } else {
 397     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
 398     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
 399     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
 400     if (Subtarget.is64Bit())
 401       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
 402   }
 403
 404   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
 405
 406   if (!Subtarget.hasMOVBE())
 407     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
 408
 409   // These should be promoted to a larger select which is supported.
 410   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
 411   // X86 wants to expand cmov itself.
 412   for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
 413     setOperationAction(ISD::SELECT, VT, Custom);
 414     setOperationAction(ISD::SETCC, VT, Custom);
 415   }
 416   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 417     if (VT == MVT::i64 && !Subtarget.is64Bit())
 418       continue;
 419     setOperationAction(ISD::SELECT, VT, Custom);
 420     setOperationAction(ISD::SETCC,  VT, Custom);
 421   }
 422
 423   // Custom action for SELECT MMX and expand action for SELECT_CC MMX
 424   setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
 425   setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
 426
 427   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
 428   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
 429   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
 430   // support continuation, user-level threading, and etc.. As a result, no
 431   // other SjLj exception interfaces are implemented and please don't build
 432   // your own exception handling based on them.
 433   // LLVM/Clang supports zero-cost DWARF exception handling.
 434   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
 435   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
 436   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
 437   if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
 438     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 439
 440   // Darwin ABI issue.
 441   for (auto VT : { MVT::i32, MVT::i64 }) {
 442     if (VT == MVT::i64 && !Subtarget.is64Bit())
 443       continue;
 444     setOperationAction(ISD::ConstantPool    , VT, Custom);
 445     setOperationAction(ISD::JumpTable       , VT, Custom);
 446     setOperationAction(ISD::GlobalAddress   , VT, Custom);
 447     setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
 448     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
 449     setOperationAction(ISD::BlockAddress    , VT, Custom);
 450   }
 451
 452   // 64-bit shl, sra, srl (iff 32-bit x86)
 453   for (auto VT : { MVT::i32, MVT::i64 }) {
 454     if (VT == MVT::i64 && !Subtarget.is64Bit())
 455       continue;
 456     setOperationAction(ISD::SHL_PARTS, VT, Custom);
 457     setOperationAction(ISD::SRA_PARTS, VT, Custom);
 458     setOperationAction(ISD::SRL_PARTS, VT, Custom);
 459   }
 460
 461   if (Subtarget.hasSSE1())
 462     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
 463
 464   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
 465
 466   // Expand certain atomics
 467   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
 468     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
 469     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
 470     setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
 471     setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
 472     setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
 473     setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
 474     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
 475   }
 476
 477   if (Subtarget.hasCmpxchg16b()) {
 478     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 479   }
 480
 481   // FIXME - use subtarget debug flags
 482   if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
 483       !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
 484       TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
 485     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
 486   }
 487
 488   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
 489   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
 490
 491   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
 492   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
 493
 494   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 495   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 496
 497   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
 498   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
 499   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
 500   bool Is64Bit = Subtarget.is64Bit();
 501   setOperationAction(ISD::VAARG,  MVT::Other, Is64Bit ? Custom : Expand);
 502   setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
 503
 504   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
 505   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 506
 507   setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
 508
 509   // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
 510   setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
 511   setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
 512
 513   if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
 514     // f32 and f64 use SSE.
 515     // Set up the FP register classes.
 516     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
 517                                                      : &X86::FR32RegClass);
 518     addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
 519                                                      : &X86::FR64RegClass);
 520
 521     for (auto VT : { MVT::f32, MVT::f64 }) {
 522       // Use ANDPD to simulate FABS.
 523       setOperationAction(ISD::FABS, VT, Custom);
 524
 525       // Use XORP to simulate FNEG.
 526       setOperationAction(ISD::FNEG, VT, Custom);
 527
 528       // Use ANDPD and ORPD to simulate FCOPYSIGN.
 529       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
 530
 531       // We don't support sin/cos/fmod
 532       setOperationAction(ISD::FSIN   , VT, Expand);
 533       setOperationAction(ISD::FCOS   , VT, Expand);
 534       setOperationAction(ISD::FSINCOS, VT, Expand);
 535     }
 536
 537     // Lower this to MOVMSK plus an AND.
 538     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
 539     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
 540
 541     // Expand FP immediates into loads from the stack, except for the special
 542     // cases we handle.
 543     addLegalFPImmediate(APFloat(+0.0)); // xorpd
 544     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 545   } else if (UseX87 && X86ScalarSSEf32) {
 546     // Use SSE for f32, x87 for f64.
 547     // Set up the FP register classes.
 548     addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
 549                                                      : &X86::FR32RegClass);
 550     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 551
 552     // Use ANDPS to simulate FABS.
 553     setOperationAction(ISD::FABS , MVT::f32, Custom);
 554
 555     // Use XORP to simulate FNEG.
 556     setOperationAction(ISD::FNEG , MVT::f32, Custom);
 557
 558     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
 559
 560     // Use ANDPS and ORPS to simulate FCOPYSIGN.
 561     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 562     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
 563
 564     // We don't support sin/cos/fmod
 565     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
 566     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
 567     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
 568
 569     // Special cases we handle for FP constants.
 570     addLegalFPImmediate(APFloat(+0.0f)); // xorps
 571     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 572     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 573     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 574     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 575
 576     if (!TM.Options.UnsafeFPMath) {
 577       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
 578       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
 579       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
 580     }
 581   } else if (UseX87) {
 582     // f32 and f64 in x87.
 583     // Set up the FP register classes.
 584     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
 585     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
 586
 587     for (auto VT : { MVT::f32, MVT::f64 }) {
 588       setOperationAction(ISD::UNDEF,     VT, Expand);
 589       setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 590
 591       if (!TM.Options.UnsafeFPMath) {
 592         setOperationAction(ISD::FSIN   , VT, Expand);
 593         setOperationAction(ISD::FCOS   , VT, Expand);
 594         setOperationAction(ISD::FSINCOS, VT, Expand);
 595       }
 596     }
 597     addLegalFPImmediate(APFloat(+0.0)); // FLD0
 598     addLegalFPImmediate(APFloat(+1.0)); // FLD1
 599     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
 600     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
 601     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
 602     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
 603     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
 604     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
 605   }
 606
 607   // We don't support FMA.
 608   setOperationAction(ISD::FMA, MVT::f64, Expand);
 609   setOperationAction(ISD::FMA, MVT::f32, Expand);
 610
 611   // Long double always uses X87, except f128 in MMX.
 612   if (UseX87) {
 613     if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
 614       addRegisterClass(MVT::f128, &X86::FR128RegClass);
 615       ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
 616       setOperationAction(ISD::FABS , MVT::f128, Custom);
 617       setOperationAction(ISD::FNEG , MVT::f128, Custom);
 618       setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
 619     }
 620
 621     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
 622     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
 623     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
 624     {
 625       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
 626       addLegalFPImmediate(TmpFlt);  // FLD0
 627       TmpFlt.changeSign();
 628       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
 629
 630       bool ignored;
 631       APFloat TmpFlt2(+1.0);
 632       TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
 633                       &ignored);
 634       addLegalFPImmediate(TmpFlt2);  // FLD1
 635       TmpFlt2.changeSign();
 636       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
 637     }
 638
 639     if (!TM.Options.UnsafeFPMath) {
 640       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
 641       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
 642       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
 643     }
 644
 645     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
 646     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
 647     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
 648     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
 649     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
 650     setOperationAction(ISD::FMA, MVT::f80, Expand);
 651   }
 652
 653   // Always use a library call for pow.
 654   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
 655   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
 656   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
 657
 658   setOperationAction(ISD::FLOG, MVT::f80, Expand);
 659   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
 660   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
 661   setOperationAction(ISD::FEXP, MVT::f80, Expand);
 662   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
 663   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
 664   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
 665
 666   // Some FP actions are always expanded for vector types.
 667   for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
 668                    MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
 669     setOperationAction(ISD::FSIN,      VT, Expand);
 670     setOperationAction(ISD::FSINCOS,   VT, Expand);
 671     setOperationAction(ISD::FCOS,      VT, Expand);
 672     setOperationAction(ISD::FREM,      VT, Expand);
 673     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
 674     setOperationAction(ISD::FPOW,      VT, Expand);
 675     setOperationAction(ISD::FLOG,      VT, Expand);
 676     setOperationAction(ISD::FLOG2,     VT, Expand);
 677     setOperationAction(ISD::FLOG10,    VT, Expand);
 678     setOperationAction(ISD::FEXP,      VT, Expand);
 679     setOperationAction(ISD::FEXP2,     VT, Expand);
 680   }
 681
 682   // First set operation action for all vector types to either promote
 683   // (for widening) or expand (for scalarization). Then we will selectively
 684   // turn on ones that can be effectively codegen'd.
 685   for (MVT VT : MVT::vector_valuetypes()) {
 686     setOperationAction(ISD::SDIV, VT, Expand);
 687     setOperationAction(ISD::UDIV, VT, Expand);
 688     setOperationAction(ISD::SREM, VT, Expand);
 689     setOperationAction(ISD::UREM, VT, Expand);
 690     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
 691     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
 692     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
 693     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
 694     setOperationAction(ISD::FMA,  VT, Expand);
 695     setOperationAction(ISD::FFLOOR, VT, Expand);
 696     setOperationAction(ISD::FCEIL, VT, Expand);
 697     setOperationAction(ISD::FTRUNC, VT, Expand);
 698     setOperationAction(ISD::FRINT, VT, Expand);
 699     setOperationAction(ISD::FNEARBYINT, VT, Expand);
 700     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
 701     setOperationAction(ISD::MULHS, VT, Expand);
 702     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 703     setOperationAction(ISD::MULHU, VT, Expand);
 704     setOperationAction(ISD::SDIVREM, VT, Expand);
 705     setOperationAction(ISD::UDIVREM, VT, Expand);
 706     setOperationAction(ISD::CTPOP, VT, Expand);
 707     setOperationAction(ISD::CTTZ, VT, Expand);
 708     setOperationAction(ISD::CTLZ, VT, Expand);
 709     setOperationAction(ISD::ROTL, VT, Expand);
 710     setOperationAction(ISD::ROTR, VT, Expand);
 711     setOperationAction(ISD::BSWAP, VT, Expand);
 712     setOperationAction(ISD::SETCC, VT, Expand);
 713     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
 714     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
 715     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
 716     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
 717     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
 718     setOperationAction(ISD::TRUNCATE, VT, Expand);
 719     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
 720     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
 721     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
 722     setOperationAction(ISD::SELECT_CC, VT, Expand);
 723     for (MVT InnerVT : MVT::vector_valuetypes()) {
 724       setTruncStoreAction(InnerVT, VT, Expand);
 725
 726       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
 727       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
 728
 729       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
 730       // types, we have to deal with them whether we ask for Expansion or not.
 731       // Setting Expand causes its own optimisation problems though, so leave
 732       // them legal.
 733       if (VT.getVectorElementType() == MVT::i1)
 734         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 735
 736       // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
 737       // split/scalarized right now.
 738       if (VT.getVectorElementType() == MVT::f16)
 739         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
 740     }
 741   }
 742
 743   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
 744   // with -msoft-float, disable use of MMX as well.
 745   if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
 746     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
 747     // No operations on x86mmx supported, everything uses intrinsics.
 748   }
 749
 750   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
 751     addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
 752                                                     : &X86::VR128RegClass);
 753
 754     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
 755     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
 756     setOperationAction(ISD::FCOPYSIGN,          MVT::v4f32, Custom);
 757     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
 758     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
 759     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
 760     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
 761     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
 762     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
 763   }
 764
 765   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
 766     addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
 767                                                     : &X86::VR128RegClass);
 768
 769     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
 770     // registers cannot be used even for integer operations.
 771     addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
 772                                                     : &X86::VR128RegClass);
 773     addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
 774                                                     : &X86::VR128RegClass);
 775     addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
 776                                                     : &X86::VR128RegClass);
 777     addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
 778                                                     : &X86::VR128RegClass);
 779
 780     setOperationAction(ISD::MUL,                MVT::v16i8, Custom);
 781     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
 782     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
 783     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
 784     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
 785     setOperationAction(ISD::MULHU,              MVT::v16i8, Custom);
 786     setOperationAction(ISD::MULHS,              MVT::v16i8, Custom);
 787     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
 788     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
 789     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
 790     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
 791     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
 792     setOperationAction(ISD::FCOPYSIGN,          MVT::v2f64, Custom);
 793
 794     setOperationAction(ISD::SMAX,               MVT::v8i16, Legal);
 795     setOperationAction(ISD::UMAX,               MVT::v16i8, Legal);
 796     setOperationAction(ISD::SMIN,               MVT::v8i16, Legal);
 797     setOperationAction(ISD::UMIN,               MVT::v16i8, Legal);
 798
 799     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
 800     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
 801     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
 802
 803     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
 804       setOperationAction(ISD::SETCC,              VT, Custom);
 805       setOperationAction(ISD::CTPOP,              VT, Custom);
 806       setOperationAction(ISD::CTTZ,               VT, Custom);
 807     }
 808
 809     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
 810       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
 811       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 812       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 813       setOperationAction(ISD::VSELECT,            VT, Custom);
 814       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 815     }
 816
 817     // We support custom legalizing of sext and anyext loads for specific
 818     // memory vector types which we can load as a scalar (or sequence of
 819     // scalars) and extend in-register to a legal 128-bit vector type. For sext
 820     // loads these must work with a single scalar load.
 821     for (MVT VT : MVT::integer_vector_valuetypes()) {
 822       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
 823       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
 824       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
 825       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
 826       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
 827       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
 828       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
 829       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
 830       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
 831     }
 832
 833     for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
 834       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
 835       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
 836       setOperationAction(ISD::VSELECT,            VT, Custom);
 837
 838       if (VT == MVT::v2i64 && !Subtarget.is64Bit())
 839         continue;
 840
 841       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
 842       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
 843     }
 844
 845     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
 846     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
 847       setOperationPromotedToType(ISD::AND,    VT, MVT::v2i64);
 848       setOperationPromotedToType(ISD::OR,     VT, MVT::v2i64);
 849       setOperationPromotedToType(ISD::XOR,    VT, MVT::v2i64);
 850       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v2i64);
 851       setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
 852     }
 853
 854     // Custom lower v2i64 and v2f64 selects.
 855     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
 856     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
 857
 858     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
 859     setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
 860
 861     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
 862     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
 863
 864     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
 865     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
 866     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i32, Custom);
 867
 868     // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
 869     setOperationAction(ISD::UINT_TO_FP,         MVT::v2f32, Custom);
 870
 871     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
 872     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 873
 874     for (MVT VT : MVT::fp_vector_valuetypes())
 875       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
 876
 877     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
 878     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
 879     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
 880
 881     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
 882     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
 883     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
 884
 885     // In the customized shift lowering, the legal v4i32/v2i64 cases
 886     // in AVX2 will be recognized.
 887     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
 888       setOperationAction(ISD::SRL,              VT, Custom);
 889       setOperationAction(ISD::SHL,              VT, Custom);
 890       setOperationAction(ISD::SRA,              VT, Custom);
 891     }
 892   }
 893
 894   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
 895     setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
 896     setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
 897     setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
 898     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
 899     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
 900     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
 901     setOperationAction(ISD::CTLZ,               MVT::v4i32, Custom);
 902     setOperationAction(ISD::CTLZ,               MVT::v2i64, Custom);
 903   }
 904
 905   if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
 906     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
 907       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
 908       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
 909       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
 910       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
 911       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
 912     }
 913
 914     setOperationAction(ISD::SMAX,               MVT::v16i8, Legal);
 915     setOperationAction(ISD::SMAX,               MVT::v4i32, Legal);
 916     setOperationAction(ISD::UMAX,               MVT::v8i16, Legal);
 917     setOperationAction(ISD::UMAX,               MVT::v4i32, Legal);
 918     setOperationAction(ISD::SMIN,               MVT::v16i8, Legal);
 919     setOperationAction(ISD::SMIN,               MVT::v4i32, Legal);
 920     setOperationAction(ISD::UMIN,               MVT::v8i16, Legal);
 921     setOperationAction(ISD::UMIN,               MVT::v4i32, Legal);
 922
 923     // FIXME: Do we need to handle scalar-to-vector here?
 924     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 925
 926     // We directly match byte blends in the backend as they match the VSELECT
 927     // condition form.
 928     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
 929
 930     // SSE41 brings specific instructions for doing vector sign extend even in
 931     // cases where we don't have SRA.
 932     for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
 933       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
 934       setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
 935     }
 936
 937     for (MVT VT : MVT::integer_vector_valuetypes()) {
 938       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
 939       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
 940       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
 941     }
 942
 943     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
 944     for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
 945       setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8,  Legal);
 946       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8,  Legal);
 947       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8,  Legal);
 948       setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
 949       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
 950       setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
 951     }
 952
 953     // i8 vectors are custom because the source register and source
 954     // memory operand types are not the same width.
 955     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
 956   }
 957
 958   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
 959     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
 960                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
 961       setOperationAction(ISD::ROTL, VT, Custom);
 962
 963     // XOP can efficiently perform BITREVERSE with VPPERM.
 964     for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
 965       setOperationAction(ISD::BITREVERSE, VT, Custom);
 966
 967     for (auto VT : { MVT::v16i8, MVT::v8i16,  MVT::v4i32, MVT::v2i64,
 968                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
 969       setOperationAction(ISD::BITREVERSE, VT, Custom);
 970   }
 971
 972   if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
 973     bool HasInt256 = Subtarget.hasInt256();
 974
 975     addRegisterClass(MVT::v32i8,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 976                                                      : &X86::VR256RegClass);
 977     addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
 978                                                      : &X86::VR256RegClass);
 979     addRegisterClass(MVT::v8i32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 980                                                      : &X86::VR256RegClass);
 981     addRegisterClass(MVT::v8f32,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 982                                                      : &X86::VR256RegClass);
 983     addRegisterClass(MVT::v4i64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 984                                                      : &X86::VR256RegClass);
 985     addRegisterClass(MVT::v4f64,  Subtarget.hasVLX() ? &X86::VR256XRegClass
 986                                                      : &X86::VR256RegClass);
 987
 988     for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
 989       setOperationAction(ISD::FFLOOR,     VT, Legal);
 990       setOperationAction(ISD::FCEIL,      VT, Legal);
 991       setOperationAction(ISD::FTRUNC,     VT, Legal);
 992       setOperationAction(ISD::FRINT,      VT, Legal);
 993       setOperationAction(ISD::FNEARBYINT, VT, Legal);
 994       setOperationAction(ISD::FNEG,       VT, Custom);
 995       setOperationAction(ISD::FABS,       VT, Custom);
 996       setOperationAction(ISD::FCOPYSIGN,  VT, Custom);
 997     }
 998
 999     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1000     // even though v8i16 is a legal type.
1001     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
1002     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
1003     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
1004
1005     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
1006     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
1007     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
1008
1009     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
1010     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
1011
1012     for (MVT VT : MVT::fp_vector_valuetypes())
1013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1014
1015     // In the customized shift lowering, the legal v8i32/v4i64 cases
1016     // in AVX2 will be recognized.
1017     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1018       setOperationAction(ISD::SRL, VT, Custom);
1019       setOperationAction(ISD::SHL, VT, Custom);
1020       setOperationAction(ISD::SRA, VT, Custom);
1021     }
1022
1023     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
1024     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
1025     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
1026
1027     for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1028       setOperationAction(ISD::SIGN_EXTEND,     VT, Custom);
1029       setOperationAction(ISD::ZERO_EXTEND,     VT, Custom);
1030       setOperationAction(ISD::ANY_EXTEND,      VT, Custom);
1031     }
1032
1033     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
1034     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
1035     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
1036     setOperationAction(ISD::BITREVERSE,        MVT::v32i8, Custom);
1037
1038     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1039       setOperationAction(ISD::SETCC,           VT, Custom);
1040       setOperationAction(ISD::CTPOP,           VT, Custom);
1041       setOperationAction(ISD::CTTZ,            VT, Custom);
1042       setOperationAction(ISD::CTLZ,            VT, Custom);
1043     }
1044
1045     if (Subtarget.hasAnyFMA()) {
1046       for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1047                        MVT::v2f64, MVT::v4f64 })
1048         setOperationAction(ISD::FMA, VT, Legal);
1049     }
1050
1051     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1052       setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1053       setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1054     }
1055
1056     setOperationAction(ISD::MUL,       MVT::v4i64,  Custom);
1057     setOperationAction(ISD::MUL,       MVT::v8i32,  HasInt256 ? Legal : Custom);
1058     setOperationAction(ISD::MUL,       MVT::v16i16, HasInt256 ? Legal : Custom);
1059     setOperationAction(ISD::MUL,       MVT::v32i8,  Custom);
1060
1061     setOperationAction(ISD::UMUL_LOHI, MVT::v8i32,  Custom);
1062     setOperationAction(ISD::SMUL_LOHI, MVT::v8i32,  Custom);
1063
1064     setOperationAction(ISD::MULHU,     MVT::v16i16, HasInt256 ? Legal : Custom);
1065     setOperationAction(ISD::MULHS,     MVT::v16i16, HasInt256 ? Legal : Custom);
1066     setOperationAction(ISD::MULHU,     MVT::v32i8,  Custom);
1067     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
1068
1069     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1070       setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
1071       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1072       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1073       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1074       setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1075     }
1076
1077     if (HasInt256) {
1078       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64,  Custom);
1079       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32,  Custom);
1080       setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1081
1082       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1083       // when we have a 256bit-wide blend with immediate.
1084       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1085
1086       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1087       for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1088         setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1089         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i8,  Legal);
1090         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i8,  Legal);
1091         setLoadExtAction(LoadExtOp, MVT::v8i32,  MVT::v8i16, Legal);
1092         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i16, Legal);
1093         setLoadExtAction(LoadExtOp, MVT::v4i64,  MVT::v4i32, Legal);
1094       }
1095     }
1096
1097     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1098                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1099       setOperationAction(ISD::MLOAD,  VT, Legal);
1100       setOperationAction(ISD::MSTORE, VT, Legal);
1101     }
1102
1103     // Extract subvector is special because the value type
1104     // (result) is 128-bit but the source is 256-bit wide.
1105     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1106                      MVT::v4f32, MVT::v2f64 }) {
1107       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1108     }
1109
1110     // Custom lower several nodes for 256-bit types.
1111     for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1112                     MVT::v8f32, MVT::v4f64 }) {
1113       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1114       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1115       setOperationAction(ISD::VSELECT,            VT, Custom);
1116       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1117       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1118       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
1119       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
1120       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
1121     }
1122
1123     if (HasInt256)
1124       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
1125
1126     // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
1127     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1128       setOperationPromotedToType(ISD::AND,    VT, MVT::v4i64);
1129       setOperationPromotedToType(ISD::OR,     VT, MVT::v4i64);
1130       setOperationPromotedToType(ISD::XOR,    VT, MVT::v4i64);
1131       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v4i64);
1132       setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1133     }
1134   }
1135
1136   if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1137     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1138     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1139     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
1140     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
1141
1142     addRegisterClass(MVT::v1i1,   &X86::VK1RegClass);
1143     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
1144     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
1145
1146     for (MVT VT : MVT::fp_vector_valuetypes())
1147       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1148
1149     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1150       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8,  Legal);
1151       setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1152       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8,  Legal);
1153       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i8,   Legal);
1154       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i16,  Legal);
1155       setLoadExtAction(ExtType, MVT::v8i64,  MVT::v8i32,  Legal);
1156     }
1157
1158     for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1159                    MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1160                    MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1161       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1162       setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1163       setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1164       setLoadExtAction(ISD::EXTLOAD,  VT, MaskVT, Custom);
1165       setTruncStoreAction(VT, MaskVT, Custom);
1166     }
1167
1168     for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1169       setOperationAction(ISD::FNEG,  VT, Custom);
1170       setOperationAction(ISD::FABS,  VT, Custom);
1171       setOperationAction(ISD::FMA,   VT, Legal);
1172       setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1173     }
1174
1175     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
1176     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
1177     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
1178     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
1179     setOperationAction(ISD::FP_TO_UINT,         MVT::v2i32, Custom);
1180     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
1181     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
1182     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
1183     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
1184     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
1185     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
1186     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
1187     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
1188     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i8, Custom);
1189     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i16, Custom);
1190     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1, Custom);
1191     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i1, Custom);
1192     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,  Custom);
1193     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i1,  Custom);
1194     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i1,  Custom);
1195     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i1,  Custom);
1196     setOperationAction(ISD::SINT_TO_FP,         MVT::v2i1,  Custom);
1197     setOperationAction(ISD::UINT_TO_FP,         MVT::v2i1,  Custom);
1198     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
1199     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
1200
1201     setTruncStoreAction(MVT::v8i64,   MVT::v8i8,   Legal);
1202     setTruncStoreAction(MVT::v8i64,   MVT::v8i16,  Legal);
1203     setTruncStoreAction(MVT::v8i64,   MVT::v8i32,  Legal);
1204     setTruncStoreAction(MVT::v16i32,  MVT::v16i8,  Legal);
1205     setTruncStoreAction(MVT::v16i32,  MVT::v16i16, Legal);
1206     if (Subtarget.hasVLX()){
1207       setTruncStoreAction(MVT::v4i64, MVT::v4i8,  Legal);
1208       setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1209       setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1210       setTruncStoreAction(MVT::v8i32, MVT::v8i8,  Legal);
1211       setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1212
1213       setTruncStoreAction(MVT::v2i64, MVT::v2i8,  Legal);
1214       setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1215       setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1216       setTruncStoreAction(MVT::v4i32, MVT::v4i8,  Legal);
1217       setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1218     } else {
1219       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1220            MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1221         setOperationAction(ISD::MLOAD,  VT, Custom);
1222         setOperationAction(ISD::MSTORE, VT, Custom);
1223       }
1224     }
1225     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
1226     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
1227
1228     if (Subtarget.hasDQI()) {
1229       for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1230         setOperationAction(ISD::SINT_TO_FP,     VT, Legal);
1231         setOperationAction(ISD::UINT_TO_FP,     VT, Legal);
1232         setOperationAction(ISD::FP_TO_SINT,     VT, Legal);
1233         setOperationAction(ISD::FP_TO_UINT,     VT, Legal);
1234       }
1235       if (Subtarget.hasVLX()) {
1236         // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1237         setOperationAction(ISD::SINT_TO_FP,    MVT::v2f32, Custom);
1238         setOperationAction(ISD::FP_TO_SINT,    MVT::v2f32, Custom);
1239         setOperationAction(ISD::FP_TO_UINT,    MVT::v2f32, Custom);
1240       }
1241     }
1242     if (Subtarget.hasVLX()) {
1243       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
1244       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
1245       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
1246       setOperationAction(ISD::FP_TO_UINT,       MVT::v8i32, Legal);
1247       setOperationAction(ISD::SINT_TO_FP,       MVT::v4i32, Legal);
1248       setOperationAction(ISD::FP_TO_SINT,       MVT::v4i32, Legal);
1249       setOperationAction(ISD::FP_TO_UINT,       MVT::v4i32, Legal);
1250       setOperationAction(ISD::ZERO_EXTEND,      MVT::v4i32, Custom);
1251       setOperationAction(ISD::ZERO_EXTEND,      MVT::v2i64, Custom);
1252       setOperationAction(ISD::SIGN_EXTEND,      MVT::v4i32, Custom);
1253       setOperationAction(ISD::SIGN_EXTEND,      MVT::v2i64, Custom);
1254
1255       // FIXME: These are available on SSE/AVX2; add relevant patterns.
1256       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8,  Legal);
1257       setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1258       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
1259       setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1260       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8,  Legal);
1261       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1262       setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1263       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
1264       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1265       setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1266     }
1267
1268     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
1269     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
1270     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
1271     setOperationAction(ISD::ANY_EXTEND,         MVT::v16i32, Custom);
1272     setOperationAction(ISD::ANY_EXTEND,         MVT::v8i64, Custom);
1273     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
1274     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
1275     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
1276     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
1277     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
1278
1279     for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1280       setOperationAction(ISD::FFLOOR,           VT, Legal);
1281       setOperationAction(ISD::FCEIL,            VT, Legal);
1282       setOperationAction(ISD::FTRUNC,           VT, Legal);
1283       setOperationAction(ISD::FRINT,            VT, Legal);
1284       setOperationAction(ISD::FNEARBYINT,       VT, Legal);
1285     }
1286
1287     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64,  Custom);
1288     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1289
1290     // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1291     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1292     setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1293
1294     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
1295     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
1296     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
1297     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
1298     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1,   Custom);
1299
1300     setOperationAction(ISD::MUL,                MVT::v8i64, Custom);
1301
1302     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1303     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v16i1, Custom);
1304     setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i1, Custom);
1305     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
1306     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
1307     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
1308
1309     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
1310
1311     // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1312     setOperationAction(ISD::ABS,                MVT::v4i64, Legal);
1313     setOperationAction(ISD::ABS,                MVT::v2i64, Legal);
1314
1315     for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1316       setOperationAction(ISD::ADD,              VT, Custom);
1317       setOperationAction(ISD::SUB,              VT, Custom);
1318       setOperationAction(ISD::MUL,              VT, Custom);
1319       setOperationAction(ISD::SETCC,            VT, Custom);
1320       setOperationAction(ISD::SELECT,           VT, Custom);
1321       setOperationAction(ISD::TRUNCATE,         VT, Custom);
1322
1323       setOperationAction(ISD::BUILD_VECTOR,     VT, Custom);
1324       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1325       setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1326       setOperationAction(ISD::VECTOR_SHUFFLE,   VT,  Custom);
1327       setOperationAction(ISD::VSELECT,          VT,  Expand);
1328     }
1329
1330     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331       setOperationAction(ISD::SMAX,             VT, Legal);
1332       setOperationAction(ISD::UMAX,             VT, Legal);
1333       setOperationAction(ISD::SMIN,             VT, Legal);
1334       setOperationAction(ISD::UMIN,             VT, Legal);
1335       setOperationAction(ISD::ABS,              VT, Legal);
1336       setOperationAction(ISD::SRL,              VT, Custom);
1337       setOperationAction(ISD::SHL,              VT, Custom);
1338       setOperationAction(ISD::SRA,              VT, Custom);
1339       setOperationAction(ISD::CTPOP,            VT, Custom);
1340       setOperationAction(ISD::CTTZ,             VT, Custom);
1341     }
1342
1343     // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1344     for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1345                     MVT::v8i64}) {
1346       setOperationAction(ISD::ROTL,             VT, Custom);
1347       setOperationAction(ISD::ROTR,             VT, Custom);
1348     }
1349
1350     // Need to promote to 64-bit even though we have 32-bit masked instructions
1351     // because the IR optimizers rearrange bitcasts around logic ops leaving
1352     // too many variations to handle if we don't promote them.
1353     setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1354     setOperationPromotedToType(ISD::OR,  MVT::v16i32, MVT::v8i64);
1355     setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1356
1357     if (Subtarget.hasCDI()) {
1358       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1359       for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1360                       MVT::v4i64, MVT::v8i64}) {
1361         setOperationAction(ISD::CTLZ,            VT, Legal);
1362         setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1363       }
1364     } // Subtarget.hasCDI()
1365
1366     if (Subtarget.hasDQI()) {
1367       // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1368       setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
1369       setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
1370       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
1371     }
1372
1373     if (Subtarget.hasVPOPCNTDQ()) {
1374       // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
1375       // version of popcntd/q.
1376       for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1377                       MVT::v4i32, MVT::v2i64})
1378         setOperationAction(ISD::CTPOP, VT, Legal);
1379     }
1380
1381     // Custom lower several nodes.
1382     for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1383                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1384       setOperationAction(ISD::MGATHER,  VT, Custom);
1385       setOperationAction(ISD::MSCATTER, VT, Custom);
1386     }
1387     // Extract subvector is special because the value type
1388     // (result) is 256-bit but the source is 512-bit wide.
1389     // 128-bit was made Custom under AVX1.
1390     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1391                      MVT::v8f32, MVT::v4f64, MVT::v1i1 })
1392       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1393     for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1394                      MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1395       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1396
1397     for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1398       setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
1399       setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
1400       setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
1401       setOperationAction(ISD::VSELECT,             VT, Custom);
1402       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
1403       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
1404       setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Legal);
1405       setOperationAction(ISD::MLOAD,               VT, Legal);
1406       setOperationAction(ISD::MSTORE,              VT, Legal);
1407       setOperationAction(ISD::MGATHER,             VT, Legal);
1408       setOperationAction(ISD::MSCATTER,            VT, Custom);
1409     }
1410     for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1411       setOperationPromotedToType(ISD::LOAD,   VT, MVT::v8i64);
1412       setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1413     }
1414   }// has  AVX-512
1415
1416   if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1417     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1418     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
1419
1420     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
1421     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
1422
1423     setOperationAction(ISD::ADD,                MVT::v32i1, Custom);
1424     setOperationAction(ISD::ADD,                MVT::v64i1, Custom);
1425     setOperationAction(ISD::SUB,                MVT::v32i1, Custom);
1426     setOperationAction(ISD::SUB,                MVT::v64i1, Custom);
1427     setOperationAction(ISD::MUL,                MVT::v32i1, Custom);
1428     setOperationAction(ISD::MUL,                MVT::v64i1, Custom);
1429
1430     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
1431     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
1432     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
1433     setOperationAction(ISD::MUL,                MVT::v64i8, Custom);
1434     setOperationAction(ISD::MULHS,              MVT::v32i16, Legal);
1435     setOperationAction(ISD::MULHU,              MVT::v32i16, Legal);
1436     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
1437     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
1438     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i16, Custom);
1439     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
1440     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
1441     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
1442     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Legal);
1443     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Legal);
1444     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1445     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1446     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1,  Custom);
1447     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1448     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i16, Custom);
1449     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v64i8, Custom);
1450     setOperationAction(ISD::SELECT,             MVT::v32i1, Custom);
1451     setOperationAction(ISD::SELECT,             MVT::v64i1, Custom);
1452     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i8, Custom);
1453     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i8, Custom);
1454     setOperationAction(ISD::SIGN_EXTEND,        MVT::v32i16, Custom);
1455     setOperationAction(ISD::ZERO_EXTEND,        MVT::v32i16, Custom);
1456     setOperationAction(ISD::ANY_EXTEND,         MVT::v32i16, Custom);
1457     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i16, Custom);
1458     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i8, Custom);
1459     setOperationAction(ISD::SIGN_EXTEND,        MVT::v64i8, Custom);
1460     setOperationAction(ISD::ZERO_EXTEND,        MVT::v64i8, Custom);
1461     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i1, Custom);
1462     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i1, Custom);
1463     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v32i16, Custom);
1464     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v64i8, Custom);
1465     setOperationAction(ISD::TRUNCATE,           MVT::v32i1, Custom);
1466     setOperationAction(ISD::TRUNCATE,           MVT::v64i1, Custom);
1467     setOperationAction(ISD::TRUNCATE,           MVT::v32i8, Custom);
1468     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v32i1, Custom);
1469     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v64i1, Custom);
1470     setOperationAction(ISD::BUILD_VECTOR,       MVT::v32i1, Custom);
1471     setOperationAction(ISD::BUILD_VECTOR,       MVT::v64i1, Custom);
1472     setOperationAction(ISD::VSELECT,            MVT::v32i1, Expand);
1473     setOperationAction(ISD::VSELECT,            MVT::v64i1, Expand);
1474     setOperationAction(ISD::BITREVERSE,         MVT::v64i8, Custom);
1475
1476     setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1477
1478     setTruncStoreAction(MVT::v32i16,  MVT::v32i8, Legal);
1479     if (Subtarget.hasVLX()) {
1480       setTruncStoreAction(MVT::v16i16,  MVT::v16i8, Legal);
1481       setTruncStoreAction(MVT::v8i16,   MVT::v8i8,  Legal);
1482     }
1483
1484     LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1485     for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1486       setOperationAction(ISD::MLOAD,               VT, Action);
1487       setOperationAction(ISD::MSTORE,              VT, Action);
1488     }
1489
1490     if (Subtarget.hasCDI()) {
1491       setOperationAction(ISD::CTLZ,            MVT::v32i16, Custom);
1492       setOperationAction(ISD::CTLZ,            MVT::v64i8,  Custom);
1493     }
1494
1495     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1496       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1497       setOperationAction(ISD::VSELECT,      VT, Custom);
1498       setOperationAction(ISD::ABS,          VT, Legal);
1499       setOperationAction(ISD::SRL,          VT, Custom);
1500       setOperationAction(ISD::SHL,          VT, Custom);
1501       setOperationAction(ISD::SRA,          VT, Custom);
1502       setOperationAction(ISD::MLOAD,        VT, Legal);
1503       setOperationAction(ISD::MSTORE,       VT, Legal);
1504       setOperationAction(ISD::CTPOP,        VT, Custom);
1505       setOperationAction(ISD::CTTZ,         VT, Custom);
1506       setOperationAction(ISD::SMAX,         VT, Legal);
1507       setOperationAction(ISD::UMAX,         VT, Legal);
1508       setOperationAction(ISD::SMIN,         VT, Legal);
1509       setOperationAction(ISD::UMIN,         VT, Legal);
1510
1511       setOperationPromotedToType(ISD::AND,  VT, MVT::v8i64);
1512       setOperationPromotedToType(ISD::OR,   VT, MVT::v8i64);
1513       setOperationPromotedToType(ISD::XOR,  VT, MVT::v8i64);
1514     }
1515
1516     for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1517       setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1518       if (Subtarget.hasVLX()) {
1519         // FIXME: These are available on SSE/AVX2; add relevant patterns.
1520         setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1521         setLoadExtAction(ExtType, MVT::v8i16,  MVT::v8i8,  Legal);
1522       }
1523     }
1524   }
1525
1526   if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1527     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
1528     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
1529
1530     for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1531       setOperationAction(ISD::ADD,                VT, Custom);
1532       setOperationAction(ISD::SUB,                VT, Custom);
1533       setOperationAction(ISD::MUL,                VT, Custom);
1534       setOperationAction(ISD::VSELECT,            VT, Expand);
1535
1536       setOperationAction(ISD::TRUNCATE,           VT, Custom);
1537       setOperationAction(ISD::SETCC,              VT, Custom);
1538       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1539       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
1540       setOperationAction(ISD::SELECT,             VT, Custom);
1541       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
1542       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
1543     }
1544
1545     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
1546     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
1547     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
1548     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
1549
1550     for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1551       setOperationAction(ISD::SMAX, VT, Legal);
1552       setOperationAction(ISD::UMAX, VT, Legal);
1553       setOperationAction(ISD::SMIN, VT, Legal);
1554       setOperationAction(ISD::UMIN, VT, Legal);
1555     }
1556   }
1557
1558   // We want to custom lower some of our intrinsics.
1559   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1560   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1561   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1562   if (!Subtarget.is64Bit()) {
1563     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1564     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1565   }
1566
1567   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1568   // handle type legalization for these operations here.
1569   //
1570   // FIXME: We really should do custom legalization for addition and
1571   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
1572   // than generic legalization for 64-bit multiplication-with-overflow, though.
1573   for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1574     if (VT == MVT::i64 && !Subtarget.is64Bit())
1575       continue;
1576     // Add/Sub/Mul with overflow operations are custom lowered.
1577     setOperationAction(ISD::SADDO, VT, Custom);
1578     setOperationAction(ISD::UADDO, VT, Custom);
1579     setOperationAction(ISD::SSUBO, VT, Custom);
1580     setOperationAction(ISD::USUBO, VT, Custom);
1581     setOperationAction(ISD::SMULO, VT, Custom);
1582     setOperationAction(ISD::UMULO, VT, Custom);
1583
1584     // Support carry in as value rather than glue.
1585     setOperationAction(ISD::ADDCARRY, VT, Custom);
1586     setOperationAction(ISD::SUBCARRY, VT, Custom);
1587     setOperationAction(ISD::SETCCCARRY, VT, Custom);
1588   }
1589
1590   if (!Subtarget.is64Bit()) {
1591     // These libcalls are not available in 32-bit.
1592     setLibcallName(RTLIB::SHL_I128, nullptr);
1593     setLibcallName(RTLIB::SRL_I128, nullptr);
1594     setLibcallName(RTLIB::SRA_I128, nullptr);
1595   }
1596
1597   // Combine sin / cos into one node or libcall if possible.
1598   if (Subtarget.hasSinCos()) {
1599     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1600     setLibcallName(RTLIB::SINCOS_F64, "sincos");
1601     if (Subtarget.isTargetDarwin()) {
1602       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1603       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1604       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1605       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1606     }
1607   }
1608
1609   if (Subtarget.isTargetWin64()) {
1610     setOperationAction(ISD::SDIV, MVT::i128, Custom);
1611     setOperationAction(ISD::UDIV, MVT::i128, Custom);
1612     setOperationAction(ISD::SREM, MVT::i128, Custom);
1613     setOperationAction(ISD::UREM, MVT::i128, Custom);
1614     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1615     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1616   }
1617
1618   // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1619   // is. We should promote the value to 64-bits to solve this.
1620   // This is what the CRT headers do - `fmodf` is an inline header
1621   // function casting to f64 and calling `fmod`.
1622   if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1623                               Subtarget.isTargetWindowsItanium()))
1624     for (ISD::NodeType Op :
1625          {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1626           ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1627       if (isOperationExpand(Op, MVT::f32))
1628         setOperationAction(Op, MVT::f32, Promote);
1629
1630   // We have target-specific dag combine patterns for the following nodes:
1631   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1632   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1633   setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1634   setTargetDAGCombine(ISD::BITCAST);
1635   setTargetDAGCombine(ISD::VSELECT);
1636   setTargetDAGCombine(ISD::SELECT);
1637   setTargetDAGCombine(ISD::SHL);
1638   setTargetDAGCombine(ISD::SRA);
1639   setTargetDAGCombine(ISD::SRL);
1640   setTargetDAGCombine(ISD::OR);
1641   setTargetDAGCombine(ISD::AND);
1642   setTargetDAGCombine(ISD::ADD);
1643   setTargetDAGCombine(ISD::FADD);
1644   setTargetDAGCombine(ISD::FSUB);
1645   setTargetDAGCombine(ISD::FNEG);
1646   setTargetDAGCombine(ISD::FMA);
1647   setTargetDAGCombine(ISD::FMINNUM);
1648   setTargetDAGCombine(ISD::FMAXNUM);
1649   setTargetDAGCombine(ISD::SUB);
1650   setTargetDAGCombine(ISD::LOAD);
1651   setTargetDAGCombine(ISD::MLOAD);
1652   setTargetDAGCombine(ISD::STORE);
1653   setTargetDAGCombine(ISD::MSTORE);
1654   setTargetDAGCombine(ISD::TRUNCATE);
1655   setTargetDAGCombine(ISD::ZERO_EXTEND);
1656   setTargetDAGCombine(ISD::ANY_EXTEND);
1657   setTargetDAGCombine(ISD::SIGN_EXTEND);
1658   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659   setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1660   setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1661   setTargetDAGCombine(ISD::SINT_TO_FP);
1662   setTargetDAGCombine(ISD::UINT_TO_FP);
1663   setTargetDAGCombine(ISD::SETCC);
1664   setTargetDAGCombine(ISD::MUL);
1665   setTargetDAGCombine(ISD::XOR);
1666   setTargetDAGCombine(ISD::MSCATTER);
1667   setTargetDAGCombine(ISD::MGATHER);
1668
1669   computeRegisterProperties(Subtarget.getRegisterInfo());
1670
1671   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1672   MaxStoresPerMemsetOptSize = 8;
1673   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1674   MaxStoresPerMemcpyOptSize = 4;
1675   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1676   MaxStoresPerMemmoveOptSize = 4;
1677
1678   // TODO: These control memcmp expansion in CGP and could be raised higher, but
1679   // that needs to be benchmarked and balanced against the potential use of
1680   // vector load/store types (PR33329, PR33914).
1681   MaxLoadsPerMemcmp = 2;
1682   MaxLoadsPerMemcmpOptSize = 2;
1683
1684   // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1685   setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1686
1687   // An out-of-order CPU can speculatively execute past a predictable branch,
1688   // but a conditional move could be stalled by an expensive earlier operation.
1689   PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1690   EnableExtLdPromotion = true;
1691   setPrefFunctionAlignment(4); // 2^4 bytes.
1692
1693   verifyIntrinsicTables();
1694 }
1695
1696 // This has so far only been implemented for 64-bit MachO.
1697 bool X86TargetLowering::useLoadStackGuardNode() const {
1698   return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1699 }
1700
1701 TargetLoweringBase::LegalizeTypeAction
1702 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1703   if (ExperimentalVectorWideningLegalization &&
1704       VT.getVectorNumElements() != 1 &&
1705       VT.getVectorElementType().getSimpleVT() != MVT::i1)
1706     return TypeWidenVector;
1707
1708   return TargetLoweringBase::getPreferredVectorAction(VT);
1709 }
1710
1711 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1712                                           LLVMContext& Context,
1713                                           EVT VT) const {
1714   if (!VT.isVector())
1715     return MVT::i8;
1716
1717   if (VT.isSimple()) {
1718     MVT VVT = VT.getSimpleVT();
1719     const unsigned NumElts = VVT.getVectorNumElements();
1720     MVT EltVT = VVT.getVectorElementType();
1721     if (VVT.is512BitVector()) {
1722       if (Subtarget.hasAVX512())
1723         if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1724             EltVT == MVT::f32 || EltVT == MVT::f64)
1725           switch(NumElts) {
1726           case  8: return MVT::v8i1;
1727           case 16: return MVT::v16i1;
1728         }
1729       if (Subtarget.hasBWI())
1730         if (EltVT == MVT::i8 || EltVT == MVT::i16)
1731           switch(NumElts) {
1732           case 32: return MVT::v32i1;
1733           case 64: return MVT::v64i1;
1734         }
1735     }
1736
1737     if (Subtarget.hasBWI() && Subtarget.hasVLX())
1738       return MVT::getVectorVT(MVT::i1, NumElts);
1739
1740     if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1741       EVT LegalVT = getTypeToTransformTo(Context, VT);
1742       EltVT = LegalVT.getVectorElementType().getSimpleVT();
1743     }
1744
1745     if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1746       switch(NumElts) {
1747       case 2: return MVT::v2i1;
1748       case 4: return MVT::v4i1;
1749       case 8: return MVT::v8i1;
1750       }
1751   }
1752
1753   return VT.changeVectorElementTypeToInteger();
1754 }
1755
1756 /// Helper for getByValTypeAlignment to determine
1757 /// the desired ByVal argument alignment.
1758 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1759   if (MaxAlign == 16)
1760     return;
1761   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1762     if (VTy->getBitWidth() == 128)
1763       MaxAlign = 16;
1764   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1765     unsigned EltAlign = 0;
1766     getMaxByValAlign(ATy->getElementType(), EltAlign);
1767     if (EltAlign > MaxAlign)
1768       MaxAlign = EltAlign;
1769   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1770     for (auto *EltTy : STy->elements()) {
1771       unsigned EltAlign = 0;
1772       getMaxByValAlign(EltTy, EltAlign);
1773       if (EltAlign > MaxAlign)
1774         MaxAlign = EltAlign;
1775       if (MaxAlign == 16)
1776         break;
1777     }
1778   }
1779 }
1780
1781 /// Return the desired alignment for ByVal aggregate
1782 /// function arguments in the caller parameter area. For X86, aggregates
1783 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1784 /// are at 4-byte boundaries.
1785 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1786                                                   const DataLayout &DL) const {
1787   if (Subtarget.is64Bit()) {
1788     // Max of 8 and alignment of type.
1789     unsigned TyAlign = DL.getABITypeAlignment(Ty);
1790     if (TyAlign > 8)
1791       return TyAlign;
1792     return 8;
1793   }
1794
1795   unsigned Align = 4;
1796   if (Subtarget.hasSSE1())
1797     getMaxByValAlign(Ty, Align);
1798   return Align;
1799 }
1800
1801 /// Returns the target specific optimal type for load
1802 /// and store operations as a result of memset, memcpy, and memmove
1803 /// lowering. If DstAlign is zero that means it's safe to destination
1804 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1805 /// means there isn't a need to check it against alignment requirement,
1806 /// probably because the source does not need to be loaded. If 'IsMemset' is
1807 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1808 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1809 /// source is constant so it does not need to be loaded.
1810 /// It returns EVT::Other if the type should be determined using generic
1811 /// target-independent logic.
1812 EVT
1813 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1814                                        unsigned DstAlign, unsigned SrcAlign,
1815                                        bool IsMemset, bool ZeroMemset,
1816                                        bool MemcpyStrSrc,
1817                                        MachineFunction &MF) const {
1818   const Function *F = MF.getFunction();
1819   if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1820     if (Size >= 16 &&
1821         (!Subtarget.isUnalignedMem16Slow() ||
1822          ((DstAlign == 0 || DstAlign >= 16) &&
1823           (SrcAlign == 0 || SrcAlign >= 16)))) {
1824       // FIXME: Check if unaligned 32-byte accesses are slow.
1825       if (Size >= 32 && Subtarget.hasAVX()) {
1826         // Although this isn't a well-supported type for AVX1, we'll let
1827         // legalization and shuffle lowering produce the optimal codegen. If we
1828         // choose an optimal type with a vector element larger than a byte,
1829         // getMemsetStores() may create an intermediate splat (using an integer
1830         // multiply) before we splat as a vector.
1831         return MVT::v32i8;
1832       }
1833       if (Subtarget.hasSSE2())
1834         return MVT::v16i8;
1835       // TODO: Can SSE1 handle a byte vector?
1836       if (Subtarget.hasSSE1())
1837         return MVT::v4f32;
1838     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1839                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1840       // Do not use f64 to lower memcpy if source is string constant. It's
1841       // better to use i32 to avoid the loads.
1842       // Also, do not use f64 to lower memset unless this is a memset of zeros.
1843       // The gymnastics of splatting a byte value into an XMM register and then
1844       // only using 8-byte stores (because this is a CPU with slow unaligned
1845       // 16-byte accesses) makes that a loser.
1846       return MVT::f64;
1847     }
1848   }
1849   // This is a compromise. If we reach here, unaligned accesses may be slow on
1850   // this target. However, creating smaller, aligned accesses could be even
1851   // slower and would certainly be a lot more code.
1852   if (Subtarget.is64Bit() && Size >= 8)
1853     return MVT::i64;
1854   return MVT::i32;
1855 }
1856
1857 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1858   if (VT == MVT::f32)
1859     return X86ScalarSSEf32;
1860   else if (VT == MVT::f64)
1861     return X86ScalarSSEf64;
1862   return true;
1863 }
1864
1865 bool
1866 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1867                                                   unsigned,
1868                                                   unsigned,
1869                                                   bool *Fast) const {
1870   if (Fast) {
1871     switch (VT.getSizeInBits()) {
1872     default:
1873       // 8-byte and under are always assumed to be fast.
1874       *Fast = true;
1875       break;
1876     case 128:
1877       *Fast = !Subtarget.isUnalignedMem16Slow();
1878       break;
1879     case 256:
1880       *Fast = !Subtarget.isUnalignedMem32Slow();
1881       break;
1882     // TODO: What about AVX-512 (512-bit) accesses?
1883     }
1884   }
1885   // Misaligned accesses of any size are always allowed.
1886   return true;
1887 }
1888
1889 /// Return the entry encoding for a jump table in the
1890 /// current function.  The returned value is a member of the
1891 /// MachineJumpTableInfo::JTEntryKind enum.
1892 unsigned X86TargetLowering::getJumpTableEncoding() const {
1893   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1894   // symbol.
1895   if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1896     return MachineJumpTableInfo::EK_Custom32;
1897
1898   // Otherwise, use the normal jump table encoding heuristics.
1899   return TargetLowering::getJumpTableEncoding();
1900 }
1901
1902 bool X86TargetLowering::useSoftFloat() const {
1903   return Subtarget.useSoftFloat();
1904 }
1905
1906 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1907                                               ArgListTy &Args) const {
1908
1909   // Only relabel X86-32 for C / Stdcall CCs.
1910   if (Subtarget.is64Bit())
1911     return;
1912   if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1913     return;
1914   unsigned ParamRegs = 0;
1915   if (auto *M = MF->getFunction()->getParent())
1916     ParamRegs = M->getNumberRegisterParameters();
1917
1918   // Mark the first N int arguments as having reg
1919   for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1920     Type *T = Args[Idx].Ty;
1921     if (T->isPointerTy() || T->isIntegerTy())
1922       if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1923         unsigned numRegs = 1;
1924         if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1925           numRegs = 2;
1926         if (ParamRegs < numRegs)
1927           return;
1928         ParamRegs -= numRegs;
1929         Args[Idx].IsInReg = true;
1930       }
1931   }
1932 }
1933
1934 const MCExpr *
1935 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1936                                              const MachineBasicBlock *MBB,
1937                                              unsigned uid,MCContext &Ctx) const{
1938   assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1939   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1940   // entries.
1941   return MCSymbolRefExpr::create(MBB->getSymbol(),
1942                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
1943 }
1944
1945 /// Returns relocation base for the given PIC jumptable.
1946 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1947                                                     SelectionDAG &DAG) const {
1948   if (!Subtarget.is64Bit())
1949     // This doesn't have SDLoc associated with it, but is not really the
1950     // same as a Register.
1951     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1952                        getPointerTy(DAG.getDataLayout()));
1953   return Table;
1954 }
1955
1956 /// This returns the relocation base for the given PIC jumptable,
1957 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1958 const MCExpr *X86TargetLowering::
1959 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1960                              MCContext &Ctx) const {
1961   // X86-64 uses RIP relative addressing based on the jump table label.
1962   if (Subtarget.isPICStyleRIPRel())
1963     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1964
1965   // Otherwise, the reference is relative to the PIC base.
1966   return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1967 }
1968
1969 std::pair<const TargetRegisterClass *, uint8_t>
1970 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1971                                            MVT VT) const {
1972   const TargetRegisterClass *RRC = nullptr;
1973   uint8_t Cost = 1;
1974   switch (VT.SimpleTy) {
1975   default:
1976     return TargetLowering::findRepresentativeClass(TRI, VT);
1977   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1978     RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1979     break;
1980   case MVT::x86mmx:
1981     RRC = &X86::VR64RegClass;
1982     break;
1983   case MVT::f32: case MVT::f64:
1984   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1985   case MVT::v4f32: case MVT::v2f64:
1986   case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1987   case MVT::v8f32: case MVT::v4f64:
1988   case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1989   case MVT::v16f32: case MVT::v8f64:
1990     RRC = &X86::VR128XRegClass;
1991     break;
1992   }
1993   return std::make_pair(RRC, Cost);
1994 }
1995
1996 unsigned X86TargetLowering::getAddressSpace() const {
1997   if (Subtarget.is64Bit())
1998     return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
1999   return 256;
2000 }
2001
2002 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2003   return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2004          (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2005 }
2006
2007 static Constant* SegmentOffset(IRBuilder<> &IRB,
2008                                unsigned Offset, unsigned AddressSpace) {
2009   return ConstantExpr::getIntToPtr(
2010       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2011       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2012 }
2013
2014 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2015   // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2016   // tcbhead_t; use it instead of the usual global variable (see
2017   // sysdeps/{i386,x86_64}/nptl/tls.h)
2018   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2019     if (Subtarget.isTargetFuchsia()) {
2020       // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
2021       return SegmentOffset(IRB, 0x10, getAddressSpace());
2022     } else {
2023       // %fs:0x28, unless we're using a Kernel code model, in which case
2024       // it's %gs:0x28.  gs:0x14 on i386.
2025       unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2026       return SegmentOffset(IRB, Offset, getAddressSpace());
2027     }
2028   }
2029
2030   return TargetLowering::getIRStackGuard(IRB);
2031 }
2032
2033 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2034   // MSVC CRT provides functionalities for stack protection.
2035   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2036     // MSVC CRT has a global variable holding security cookie.
2037     M.getOrInsertGlobal("__security_cookie",
2038                         Type::getInt8PtrTy(M.getContext()));
2039
2040     // MSVC CRT has a function to validate security cookie.
2041     auto *SecurityCheckCookie = cast<Function>(
2042         M.getOrInsertFunction("__security_check_cookie",
2043                               Type::getVoidTy(M.getContext()),
2044                               Type::getInt8PtrTy(M.getContext())));
2045     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2046     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2047     return;
2048   }
2049   // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2050   if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2051     return;
2052   TargetLowering::insertSSPDeclarations(M);
2053 }
2054
2055 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2056   // MSVC CRT has a global variable holding security cookie.
2057   if (Subtarget.getTargetTriple().isOSMSVCRT())
2058     return M.getGlobalVariable("__security_cookie");
2059   return TargetLowering::getSDagStackGuard(M);
2060 }
2061
2062 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2063   // MSVC CRT has a function to validate security cookie.
2064   if (Subtarget.getTargetTriple().isOSMSVCRT())
2065     return M.getFunction("__security_check_cookie");
2066   return TargetLowering::getSSPStackGuardCheck(M);
2067 }
2068
2069 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2070   if (Subtarget.getTargetTriple().isOSContiki())
2071     return getDefaultSafeStackPointerLocation(IRB, false);
2072
2073   // Android provides a fixed TLS slot for the SafeStack pointer. See the
2074   // definition of TLS_SLOT_SAFESTACK in
2075   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2076   if (Subtarget.isTargetAndroid()) {
2077     // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2078     // %gs:0x24 on i386
2079     unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2080     return SegmentOffset(IRB, Offset, getAddressSpace());
2081   }
2082
2083   // Fuchsia is similar.
2084   if (Subtarget.isTargetFuchsia()) {
2085     // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2086     return SegmentOffset(IRB, 0x18, getAddressSpace());
2087   }
2088
2089   return TargetLowering::getSafeStackPointerLocation(IRB);
2090 }
2091
2092 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2093                                             unsigned DestAS) const {
2094   assert(SrcAS != DestAS && "Expected different address spaces!");
2095
2096   return SrcAS < 256 && DestAS < 256;
2097 }
2098
2099 //===----------------------------------------------------------------------===//
2100 //               Return Value Calling Convention Implementation
2101 //===----------------------------------------------------------------------===//
2102
2103 #include "X86GenCallingConv.inc"
2104
2105 bool X86TargetLowering::CanLowerReturn(
2106     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2107     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2108   SmallVector<CCValAssign, 16> RVLocs;
2109   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2110   return CCInfo.CheckReturn(Outs, RetCC_X86);
2111 }
2112
2113 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2114   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2115   return ScratchRegs;
2116 }
2117
2118 /// Lowers masks values (v*i1) to the local register values
2119 /// \returns DAG node after lowering to register type
2120 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2121                                const SDLoc &Dl, SelectionDAG &DAG) {
2122   EVT ValVT = ValArg.getValueType();
2123
2124   if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2125       (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2126     // Two stage lowering might be required
2127     // bitcast:   v8i1 -> i8 / v16i1 -> i16
2128     // anyextend: i8   -> i32 / i16   -> i32
2129     EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2130     SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2131     if (ValLoc == MVT::i32)
2132       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2133     return ValToCopy;
2134   } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2135              (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2136     // One stage lowering is required
2137     // bitcast:   v32i1 -> i32 / v64i1 -> i64
2138     return DAG.getBitcast(ValLoc, ValArg);
2139   } else
2140     return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2141 }
2142
2143 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2144 static void Passv64i1ArgInRegs(
2145     const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2146     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2147     CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2148   assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2149          "Expected AVX512BW or AVX512BMI target!");
2150   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2151   assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2152   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2153          "The value should reside in two registers");
2154
2155   // Before splitting the value we cast it to i64
2156   Arg = DAG.getBitcast(MVT::i64, Arg);
2157
2158   // Splitting the value into two i32 types
2159   SDValue Lo, Hi;
2160   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2161                    DAG.getConstant(0, Dl, MVT::i32));
2162   Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2163                    DAG.getConstant(1, Dl, MVT::i32));
2164
2165   // Attach the two i32 types into corresponding registers
2166   RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2167   RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2168 }
2169
2170 SDValue
2171 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2172                                bool isVarArg,
2173                                const SmallVectorImpl<ISD::OutputArg> &Outs,
2174                                const SmallVectorImpl<SDValue> &OutVals,
2175                                const SDLoc &dl, SelectionDAG &DAG) const {
2176   MachineFunction &MF = DAG.getMachineFunction();
2177   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2178
2179   // In some cases we need to disable registers from the default CSR list.
2180   // For example, when they are used for argument passing.
2181   bool ShouldDisableCalleeSavedRegister =
2182       CallConv == CallingConv::X86_RegCall ||
2183       MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2184
2185   if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2186     report_fatal_error("X86 interrupts may not return any value");
2187
2188   SmallVector<CCValAssign, 16> RVLocs;
2189   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2190   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2191
2192   SDValue Flag;
2193   SmallVector<SDValue, 6> RetOps;
2194   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2195   // Operand #1 = Bytes To Pop
2196   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2197                    MVT::i32));
2198
2199   // Copy the result values into the output registers.
2200   for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2201        ++I, ++OutsIndex) {
2202     CCValAssign &VA = RVLocs[I];
2203     assert(VA.isRegLoc() && "Can only return in registers!");
2204
2205     // Add the register to the CalleeSaveDisableRegs list.
2206     if (ShouldDisableCalleeSavedRegister)
2207       MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2208
2209     SDValue ValToCopy = OutVals[OutsIndex];
2210     EVT ValVT = ValToCopy.getValueType();
2211
2212     // Promote values to the appropriate types.
2213     if (VA.getLocInfo() == CCValAssign::SExt)
2214       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2215     else if (VA.getLocInfo() == CCValAssign::ZExt)
2216       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2217     else if (VA.getLocInfo() == CCValAssign::AExt) {
2218       if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2219         ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2220       else
2221         ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2222     }
2223     else if (VA.getLocInfo() == CCValAssign::BCvt)
2224       ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2225
2226     assert(VA.getLocInfo() != CCValAssign::FPExt &&
2227            "Unexpected FP-extend for return value.");
2228
2229     // If this is x86-64, and we disabled SSE, we can't return FP values,
2230     // or SSE or MMX vectors.
2231     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2232          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2233         (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2234       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2235       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2236     } else if (ValVT == MVT::f64 &&
2237                (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2238       // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
2239       // llvm-gcc has never done it right and no one has noticed, so this
2240       // should be OK for now.
2241       errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2242       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2243     }
2244
2245     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2246     // the RET instruction and handled by the FP Stackifier.
2247     if (VA.getLocReg() == X86::FP0 ||
2248         VA.getLocReg() == X86::FP1) {
2249       // If this is a copy from an xmm register to ST(0), use an FPExtend to
2250       // change the value to the FP stack register class.
2251       if (isScalarFPTypeInSSEReg(VA.getValVT()))
2252         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2253       RetOps.push_back(ValToCopy);
2254       // Don't emit a copytoreg.
2255       continue;
2256     }
2257
2258     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2259     // which is returned in RAX / RDX.
2260     if (Subtarget.is64Bit()) {
2261       if (ValVT == MVT::x86mmx) {
2262         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2263           ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2264           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2265                                   ValToCopy);
2266           // If we don't have SSE2 available, convert to v4f32 so the generated
2267           // register is legal.
2268           if (!Subtarget.hasSSE2())
2269             ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2270         }
2271       }
2272     }
2273
2274     SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2275
2276     if (VA.needsCustom()) {
2277       assert(VA.getValVT() == MVT::v64i1 &&
2278              "Currently the only custom case is when we split v64i1 to 2 regs");
2279
2280       Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2281                          Subtarget);
2282
2283       assert(2 == RegsToPass.size() &&
2284              "Expecting two registers after Pass64BitArgInRegs");
2285
2286       // Add the second register to the CalleeSaveDisableRegs list.
2287       if (ShouldDisableCalleeSavedRegister)
2288         MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2289     } else {
2290       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2291     }
2292
2293     // Add nodes to the DAG and add the values into the RetOps list
2294     for (auto &Reg : RegsToPass) {
2295       Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2296       Flag = Chain.getValue(1);
2297       RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2298     }
2299   }
2300
2301   // Swift calling convention does not require we copy the sret argument
2302   // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2303
2304   // All x86 ABIs require that for returning structs by value we copy
2305   // the sret argument into %rax/%eax (depending on ABI) for the return.
2306   // We saved the argument into a virtual register in the entry block,
2307   // so now we copy the value out and into %rax/%eax.
2308   //
2309   // Checking Function.hasStructRetAttr() here is insufficient because the IR
2310   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2311   // false, then an sret argument may be implicitly inserted in the SelDAG. In
2312   // either case FuncInfo->setSRetReturnReg() will have been called.
2313   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2314     // When we have both sret and another return value, we should use the
2315     // original Chain stored in RetOps[0], instead of the current Chain updated
2316     // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2317
2318     // For the case of sret and another return value, we have
2319     //   Chain_0 at the function entry
2320     //   Chain_1 = getCopyToReg(Chain_0) in the above loop
2321     // If we use Chain_1 in getCopyFromReg, we will have
2322     //   Val = getCopyFromReg(Chain_1)
2323     //   Chain_2 = getCopyToReg(Chain_1, Val) from below
2324
2325     // getCopyToReg(Chain_0) will be glued together with
2326     // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2327     // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2328     //   Data dependency from Unit B to Unit A due to usage of Val in
2329     //     getCopyToReg(Chain_1, Val)
2330     //   Chain dependency from Unit A to Unit B
2331
2332     // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2333     SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2334                                      getPointerTy(MF.getDataLayout()));
2335
2336     unsigned RetValReg
2337         = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2338           X86::RAX : X86::EAX;
2339     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2340     Flag = Chain.getValue(1);
2341
2342     // RAX/EAX now acts like a return value.
2343     RetOps.push_back(
2344         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2345
2346     // Add the returned register to the CalleeSaveDisableRegs list.
2347     if (ShouldDisableCalleeSavedRegister)
2348       MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2349   }
2350
2351   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2352   const MCPhysReg *I =
2353       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2354   if (I) {
2355     for (; *I; ++I) {
2356       if (X86::GR64RegClass.contains(*I))
2357         RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2358       else
2359         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2360     }
2361   }
2362
2363   RetOps[0] = Chain;  // Update chain.
2364
2365   // Add the flag if we have it.
2366   if (Flag.getNode())
2367     RetOps.push_back(Flag);
2368
2369   X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2370   if (CallConv == CallingConv::X86_INTR)
2371     opcode = X86ISD::IRET;
2372   return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2373 }
2374
2375 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2376   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2377     return false;
2378
2379   SDValue TCChain = Chain;
2380   SDNode *Copy = *N->use_begin();
2381   if (Copy->getOpcode() == ISD::CopyToReg) {
2382     // If the copy has a glue operand, we conservatively assume it isn't safe to
2383     // perform a tail call.
2384     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2385       return false;
2386     TCChain = Copy->getOperand(0);
2387   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2388     return false;
2389
2390   bool HasRet = false;
2391   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2392        UI != UE; ++UI) {
2393     if (UI->getOpcode() != X86ISD::RET_FLAG)
2394       return false;
2395     // If we are returning more than one value, we can definitely
2396     // not make a tail call see PR19530
2397     if (UI->getNumOperands() > 4)
2398       return false;
2399     if (UI->getNumOperands() == 4 &&
2400         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2401       return false;
2402     HasRet = true;
2403   }
2404
2405   if (!HasRet)
2406     return false;
2407
2408   Chain = TCChain;
2409   return true;
2410 }
2411
2412 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2413                                            ISD::NodeType ExtendKind) const {
2414   MVT ReturnMVT = MVT::i32;
2415
2416   bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2417   if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2418     // The ABI does not require i1, i8 or i16 to be extended.
2419     //
2420     // On Darwin, there is code in the wild relying on Clang's old behaviour of
2421     // always extending i8/i16 return values, so keep doing that for now.
2422     // (PR26665).
2423     ReturnMVT = MVT::i8;
2424   }
2425
2426   EVT MinVT = getRegisterType(Context, ReturnMVT);
2427   return VT.bitsLT(MinVT) ? MinVT : VT;
2428 }
2429
2430 /// Reads two 32 bit registers and creates a 64 bit mask value.
2431 /// \param VA The current 32 bit value that need to be assigned.
2432 /// \param NextVA The next 32 bit value that need to be assigned.
2433 /// \param Root The parent DAG node.
2434 /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2435 ///                        glue purposes. In the case the DAG is already using
2436 ///                        physical register instead of virtual, we should glue
2437 ///                        our new SDValue to InFlag SDvalue.
2438 /// \return a new SDvalue of size 64bit.
2439 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2440                                 SDValue &Root, SelectionDAG &DAG,
2441                                 const SDLoc &Dl, const X86Subtarget &Subtarget,
2442                                 SDValue *InFlag = nullptr) {
2443   assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2444   assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2445   assert(VA.getValVT() == MVT::v64i1 &&
2446          "Expecting first location of 64 bit width type");
2447   assert(NextVA.getValVT() == VA.getValVT() &&
2448          "The locations should have the same type");
2449   assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2450          "The values should reside in two registers");
2451
2452   SDValue Lo, Hi;
2453   unsigned Reg;
2454   SDValue ArgValueLo, ArgValueHi;
2455
2456   MachineFunction &MF = DAG.getMachineFunction();
2457   const TargetRegisterClass *RC = &X86::GR32RegClass;
2458
2459   // Read a 32 bit value from the registers
2460   if (nullptr == InFlag) {
2461     // When no physical register is present,
2462     // create an intermediate virtual register
2463     Reg = MF.addLiveIn(VA.getLocReg(), RC);
2464     ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2465     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2466     ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2467   } else {
2468     // When a physical register is available read the value from it and glue
2469     // the reads together.
2470     ArgValueLo =
2471       DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2472     *InFlag = ArgValueLo.getValue(2);
2473     ArgValueHi =
2474       DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2475     *InFlag = ArgValueHi.getValue(2);
2476   }
2477
2478   // Convert the i32 type into v32i1 type
2479   Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2480
2481   // Convert the i32 type into v32i1 type
2482   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2483
2484   // Concatenate the two values together
2485   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2486 }
2487
2488 /// The function will lower a register of various sizes (8/16/32/64)
2489 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2490 /// \returns a DAG node contains the operand after lowering to mask type.
2491 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2492                                const EVT &ValLoc, const SDLoc &Dl,
2493                                SelectionDAG &DAG) {
2494   SDValue ValReturned = ValArg;
2495
2496   if (ValVT == MVT::v1i1)
2497     return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2498
2499   if (ValVT == MVT::v64i1) {
2500     // In 32 bit machine, this case is handled by getv64i1Argument
2501     assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2502     // In 64 bit machine, There is no need to truncate the value only bitcast
2503   } else {
2504     MVT maskLen;
2505     switch (ValVT.getSimpleVT().SimpleTy) {
2506     case MVT::v8i1:
2507       maskLen = MVT::i8;
2508       break;
2509     case MVT::v16i1:
2510       maskLen = MVT::i16;
2511       break;
2512     case MVT::v32i1:
2513       maskLen = MVT::i32;
2514       break;
2515     default:
2516       llvm_unreachable("Expecting a vector of i1 types");
2517     }
2518
2519     ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2520   }
2521   return DAG.getBitcast(ValVT, ValReturned);
2522 }
2523
2524 /// Lower the result values of a call into the
2525 /// appropriate copies out of appropriate physical registers.
2526 ///
2527 SDValue X86TargetLowering::LowerCallResult(
2528     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2529     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2530     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2531     uint32_t *RegMask) const {
2532
2533   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2534   // Assign locations to each value returned by this call.
2535   SmallVector<CCValAssign, 16> RVLocs;
2536   bool Is64Bit = Subtarget.is64Bit();
2537   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2538                  *DAG.getContext());
2539   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2540
2541   // Copy all of the result registers out of their specified physreg.
2542   for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2543        ++I, ++InsIndex) {
2544     CCValAssign &VA = RVLocs[I];
2545     EVT CopyVT = VA.getLocVT();
2546
2547     // In some calling conventions we need to remove the used registers
2548     // from the register mask.
2549     if (RegMask) {
2550       for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2551            SubRegs.isValid(); ++SubRegs)
2552         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2553     }
2554
2555     // If this is x86-64, and we disabled SSE, we can't return FP values
2556     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2557         ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2558       errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2559       VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2560     }
2561
2562     // If we prefer to use the value in xmm registers, copy it out as f80 and
2563     // use a truncate to move it from fp stack reg to xmm reg.
2564     bool RoundAfterCopy = false;
2565     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2566         isScalarFPTypeInSSEReg(VA.getValVT())) {
2567       if (!Subtarget.hasX87())
2568         report_fatal_error("X87 register return with X87 disabled");
2569       CopyVT = MVT::f80;
2570       RoundAfterCopy = (CopyVT != VA.getLocVT());
2571     }
2572
2573     SDValue Val;
2574     if (VA.needsCustom()) {
2575       assert(VA.getValVT() == MVT::v64i1 &&
2576              "Currently the only custom case is when we split v64i1 to 2 regs");
2577       Val =
2578           getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2579     } else {
2580       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2581                   .getValue(1);
2582       Val = Chain.getValue(0);
2583       InFlag = Chain.getValue(2);
2584     }
2585
2586     if (RoundAfterCopy)
2587       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2588                         // This truncation won't change the value.
2589                         DAG.getIntPtrConstant(1, dl));
2590
2591     if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2592       if (VA.getValVT().isVector() &&
2593           ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2594            (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2595         // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2596         Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2597       } else
2598         Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2599     }
2600
2601     InVals.push_back(Val);
2602   }
2603
2604   return Chain;
2605 }
2606
2607 //===----------------------------------------------------------------------===//
2608 //                C & StdCall & Fast Calling Convention implementation
2609 //===----------------------------------------------------------------------===//
//  The StdCall calling convention is used by many Windows API routines. It
//  differs from the C calling convention only slightly: the callee, not the
//  caller, cleans up the stack, and symbol names are decorated. It does not
//  support any vector arguments.
//  For info on the fast calling convention, see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
2616
2617 /// CallIsStructReturn - Determines whether a call uses struct return
2618 /// semantics.
2619 enum StructReturnType {
2620   NotStructReturn,
2621   RegStructReturn,
2622   StackStructReturn
2623 };
2624 static StructReturnType
2625 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2626   if (Outs.empty())
2627     return NotStructReturn;
2628
2629   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2630   if (!Flags.isSRet())
2631     return NotStructReturn;
2632   if (Flags.isInReg() || IsMCU)
2633     return RegStructReturn;
2634   return StackStructReturn;
2635 }
2636
2637 /// Determines whether a function uses struct return semantics.
2638 static StructReturnType
2639 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2640   if (Ins.empty())
2641     return NotStructReturn;
2642
2643   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2644   if (!Flags.isSRet())
2645     return NotStructReturn;
2646   if (Flags.isInReg() || IsMCU)
2647     return RegStructReturn;
2648   return StackStructReturn;
2649 }
2650
2651 /// Make a copy of an aggregate at address specified by "Src" to address
2652 /// "Dst" with size and alignment information specified by the specific
2653 /// parameter attribute. The copy will be passed as a byval function parameter.
2654 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2655                                          SDValue Chain, ISD::ArgFlagsTy Flags,
2656                                          SelectionDAG &DAG, const SDLoc &dl) {
2657   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2658
2659   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2660                        /*isVolatile*/false, /*AlwaysInline=*/true,
2661                        /*isTailCall*/false,
2662                        MachinePointerInfo(), MachinePointerInfo());
2663 }
2664
2665 /// Return true if the calling convention is one that we can guarantee TCO for.
2666 static bool canGuaranteeTCO(CallingConv::ID CC) {
2667   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2668           CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2669           CC == CallingConv::HHVM);
2670 }
2671
2672 /// Return true if we might ever do TCO for calls with this calling convention.
2673 static bool mayTailCallThisCC(CallingConv::ID CC) {
2674   switch (CC) {
2675   // C calling conventions:
2676   case CallingConv::C:
2677   case CallingConv::Win64:
2678   case CallingConv::X86_64_SysV:
2679   // Callee pop conventions:
2680   case CallingConv::X86_ThisCall:
2681   case CallingConv::X86_StdCall:
2682   case CallingConv::X86_VectorCall:
2683   case CallingConv::X86_FastCall:
2684     return true;
2685   default:
2686     return canGuaranteeTCO(CC);
2687   }
2688 }
2689
2690 /// Return true if the function is being made into a tailcall target by
2691 /// changing its ABI.
2692 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2693   return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2694 }
2695
2696 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2697   auto Attr =
2698       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2699   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2700     return false;
2701
2702   ImmutableCallSite CS(CI);
2703   CallingConv::ID CalleeCC = CS.getCallingConv();
2704   if (!mayTailCallThisCC(CalleeCC))
2705     return false;
2706
2707   return true;
2708 }
2709
2710 SDValue
2711 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2712                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2713                                     const SDLoc &dl, SelectionDAG &DAG,
2714                                     const CCValAssign &VA,
2715                                     MachineFrameInfo &MFI, unsigned i) const {
2716   // Create the nodes corresponding to a load from this parameter slot.
2717   ISD::ArgFlagsTy Flags = Ins[i].Flags;
2718   bool AlwaysUseMutable = shouldGuaranteeTCO(
2719       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2720   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2721   EVT ValVT;
2722   MVT PtrVT = getPointerTy(DAG.getDataLayout());
2723
2724   // If value is passed by pointer we have address passed instead of the value
2725   // itself. No need to extend if the mask value and location share the same
2726   // absolute size.
2727   bool ExtendedInMem =
2728       VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2729       VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2730
2731   if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2732     ValVT = VA.getLocVT();
2733   else
2734     ValVT = VA.getValVT();
2735
2736   // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2737   // taken by a return address.
2738   int Offset = 0;
2739   if (CallConv == CallingConv::X86_INTR) {
2740     // X86 interrupts may take one or two arguments.
2741     // On the stack there will be no return address as in regular call.
2742     // Offset of last argument need to be set to -4/-8 bytes.
2743     // Where offset of the first argument out of two, should be set to 0 bytes.
2744     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2745     if (Subtarget.is64Bit() && Ins.size() == 2) {
2746       // The stack pointer needs to be realigned for 64 bit handlers with error
2747       // code, so the argument offset changes by 8 bytes.
2748       Offset += 8;
2749     }
2750   }
2751
2752   // FIXME: For now, all byval parameter objects are marked mutable. This can be
2753   // changed with more analysis.
2754   // In case of tail call optimization mark all arguments mutable. Since they
2755   // could be overwritten by lowering of arguments in case of a tail call.
2756   if (Flags.isByVal()) {
2757     unsigned Bytes = Flags.getByValSize();
2758     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2759     int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2760     // Adjust SP offset of interrupt parameter.
2761     if (CallConv == CallingConv::X86_INTR) {
2762       MFI.setObjectOffset(FI, Offset);
2763     }
2764     return DAG.getFrameIndex(FI, PtrVT);
2765   }
2766
2767   // This is an argument in memory. We might be able to perform copy elision.
2768   if (Flags.isCopyElisionCandidate()) {
2769     EVT ArgVT = Ins[i].ArgVT;
2770     SDValue PartAddr;
2771     if (Ins[i].PartOffset == 0) {
2772       // If this is a one-part value or the first part of a multi-part value,
2773       // create a stack object for the entire argument value type and return a
2774       // load from our portion of it. This assumes that if the first part of an
2775       // argument is in memory, the rest will also be in memory.
2776       int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2777                                      /*Immutable=*/false);
2778       PartAddr = DAG.getFrameIndex(FI, PtrVT);
2779       return DAG.getLoad(
2780           ValVT, dl, Chain, PartAddr,
2781           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2782     } else {
2783       // This is not the first piece of an argument in memory. See if there is
2784       // already a fixed stack object including this offset. If so, assume it
2785       // was created by the PartOffset == 0 branch above and create a load from
2786       // the appropriate offset into it.
2787       int64_t PartBegin = VA.getLocMemOffset();
2788       int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2789       int FI = MFI.getObjectIndexBegin();
2790       for (; MFI.isFixedObjectIndex(FI); ++FI) {
2791         int64_t ObjBegin = MFI.getObjectOffset(FI);
2792         int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2793         if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2794           break;
2795       }
2796       if (MFI.isFixedObjectIndex(FI)) {
2797         SDValue Addr =
2798             DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2799                         DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2800         return DAG.getLoad(
2801             ValVT, dl, Chain, Addr,
2802             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2803                                               Ins[i].PartOffset));
2804       }
2805     }
2806   }
2807
2808   int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2809                                  VA.getLocMemOffset(), isImmutable);
2810
2811   // Set SExt or ZExt flag.
2812   if (VA.getLocInfo() == CCValAssign::ZExt) {
2813     MFI.setObjectZExt(FI, true);
2814   } else if (VA.getLocInfo() == CCValAssign::SExt) {
2815     MFI.setObjectSExt(FI, true);
2816   }
2817
2818   // Adjust SP offset of interrupt parameter.
2819   if (CallConv == CallingConv::X86_INTR) {
2820     MFI.setObjectOffset(FI, Offset);
2821   }
2822
2823   SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2824   SDValue Val = DAG.getLoad(
2825       ValVT, dl, Chain, FIN,
2826       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2827   return ExtendedInMem
2828              ? (VA.getValVT().isVector()
2829                     ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2830                     : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2831              : Val;
2832 }
2833
2834 // FIXME: Get this from tablegen.
2835 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2836                                                 const X86Subtarget &Subtarget) {
2837   assert(Subtarget.is64Bit());
2838
2839   if (Subtarget.isCallingConvWin64(CallConv)) {
2840     static const MCPhysReg GPR64ArgRegsWin64[] = {
2841       X86::RCX, X86::RDX, X86::R8,  X86::R9
2842     };
2843     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2844   }
2845
2846   static const MCPhysReg GPR64ArgRegs64Bit[] = {
2847     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2848   };
2849   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2850 }
2851
2852 // FIXME: Get this from tablegen.
2853 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2854                                                 CallingConv::ID CallConv,
2855                                                 const X86Subtarget &Subtarget) {
2856   assert(Subtarget.is64Bit());
2857   if (Subtarget.isCallingConvWin64(CallConv)) {
2858     // The XMM registers which might contain var arg parameters are shadowed
2859     // in their paired GPR.  So we only need to save the GPR to their home
2860     // slots.
2861     // TODO: __vectorcall will change this.
2862     return None;
2863   }
2864
2865   const Function *Fn = MF.getFunction();
2866   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2867   bool isSoftFloat = Subtarget.useSoftFloat();
2868   assert(!(isSoftFloat && NoImplicitFloatOps) &&
2869          "SSE register cannot be used when SSE is disabled!");
2870   if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2871     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2872     // registers.
2873     return None;
2874
2875   static const MCPhysReg XMMArgRegs64Bit[] = {
2876     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2877     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2878   };
2879   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2880 }
2881
2882 #ifndef NDEBUG
2883 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2884   return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2885                         [](const CCValAssign &A, const CCValAssign &B) -> bool {
2886                           return A.getValNo() < B.getValNo();
2887                         });
2888 }
2889 #endif
2890
2891 SDValue X86TargetLowering::LowerFormalArguments(
2892     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2893     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2894     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2895   MachineFunction &MF = DAG.getMachineFunction();
2896   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2897   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2898
2899   const Function *Fn = MF.getFunction();
2900   if (Fn->hasExternalLinkage() &&
2901       Subtarget.isTargetCygMing() &&
2902       Fn->getName() == "main")
2903     FuncInfo->setForceFramePointer(true);
2904
2905   MachineFrameInfo &MFI = MF.getFrameInfo();
2906   bool Is64Bit = Subtarget.is64Bit();
2907   bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2908
2909   assert(
2910       !(isVarArg && canGuaranteeTCO(CallConv)) &&
2911       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2912
2913   if (CallConv == CallingConv::X86_INTR) {
2914     bool isLegal = Ins.size() == 1 ||
2915                    (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2916                                         (!Is64Bit && Ins[1].VT == MVT::i32)));
2917     if (!isLegal)
2918       report_fatal_error("X86 interrupts may take one or two arguments");
2919   }
2920
2921   // Assign locations to all of the incoming arguments.
2922   SmallVector<CCValAssign, 16> ArgLocs;
2923   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2924
2925   // Allocate shadow area for Win64.
2926   if (IsWin64)
2927     CCInfo.AllocateStack(32, 8);
2928
2929   CCInfo.AnalyzeArguments(Ins, CC_X86);
2930
2931   // In vectorcall calling convention a second pass is required for the HVA
2932   // types.
2933   if (CallingConv::X86_VectorCall == CallConv) {
2934     CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2935   }
2936
2937   // The next loop assumes that the locations are in the same order of the
2938   // input arguments.
2939   assert(isSortedByValueNo(ArgLocs) &&
2940          "Argument Location list must be sorted before lowering");
2941
2942   SDValue ArgValue;
2943   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2944        ++I, ++InsIndex) {
2945     assert(InsIndex < Ins.size() && "Invalid Ins index");
2946     CCValAssign &VA = ArgLocs[I];
2947
2948     if (VA.isRegLoc()) {
2949       EVT RegVT = VA.getLocVT();
2950       if (VA.needsCustom()) {
2951         assert(
2952             VA.getValVT() == MVT::v64i1 &&
2953             "Currently the only custom case is when we split v64i1 to 2 regs");
2954
2955         // v64i1 values, in regcall calling convention, that are
2956         // compiled to 32 bit arch, are split up into two registers.
2957         ArgValue =
2958             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2959       } else {
2960         const TargetRegisterClass *RC;
2961         if (RegVT == MVT::i32)
2962           RC = &X86::GR32RegClass;
2963         else if (Is64Bit && RegVT == MVT::i64)
2964           RC = &X86::GR64RegClass;
2965         else if (RegVT == MVT::f32)
2966           RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2967         else if (RegVT == MVT::f64)
2968           RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2969         else if (RegVT == MVT::f80)
2970           RC = &X86::RFP80RegClass;
2971         else if (RegVT == MVT::f128)
2972           RC = &X86::FR128RegClass;
2973         else if (RegVT.is512BitVector())
2974           RC = &X86::VR512RegClass;
2975         else if (RegVT.is256BitVector())
2976           RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2977         else if (RegVT.is128BitVector())
2978           RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2979         else if (RegVT == MVT::x86mmx)
2980           RC = &X86::VR64RegClass;
2981         else if (RegVT == MVT::v1i1)
2982           RC = &X86::VK1RegClass;
2983         else if (RegVT == MVT::v8i1)
2984           RC = &X86::VK8RegClass;
2985         else if (RegVT == MVT::v16i1)
2986           RC = &X86::VK16RegClass;
2987         else if (RegVT == MVT::v32i1)
2988           RC = &X86::VK32RegClass;
2989         else if (RegVT == MVT::v64i1)
2990           RC = &X86::VK64RegClass;
2991         else
2992           llvm_unreachable("Unknown argument type!");
2993
2994         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2995         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2996       }
2997
2998       // If this is an 8 or 16-bit value, it is really passed promoted to 32
2999       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
3000       // right size.
3001       if (VA.getLocInfo() == CCValAssign::SExt)
3002         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3003                                DAG.getValueType(VA.getValVT()));
3004       else if (VA.getLocInfo() == CCValAssign::ZExt)
3005         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3006                                DAG.getValueType(VA.getValVT()));
3007       else if (VA.getLocInfo() == CCValAssign::BCvt)
3008         ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3009
3010       if (VA.isExtInLoc()) {
3011         // Handle MMX values passed in XMM regs.
3012         if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3013           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3014         else if (VA.getValVT().isVector() &&
3015                  VA.getValVT().getScalarType() == MVT::i1 &&
3016                  ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3017                   (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3018           // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3019           ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3020         } else
3021           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3022       }
3023     } else {
3024       assert(VA.isMemLoc());
3025       ArgValue =
3026           LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3027     }
3028
3029     // If value is passed via pointer - do a load.
3030     if (VA.getLocInfo() == CCValAssign::Indirect)
3031       ArgValue =
3032           DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3033
3034     InVals.push_back(ArgValue);
3035   }
3036
3037   for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3038     // Swift calling convention does not require we copy the sret argument
3039     // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3040     if (CallConv == CallingConv::Swift)
3041       continue;
3042
3043     // All x86 ABIs require that for returning structs by value we copy the
3044     // sret argument into %rax/%eax (depending on ABI) for the return. Save
3045     // the argument into a virtual register so that we can access it from the
3046     // return points.
3047     if (Ins[I].Flags.isSRet()) {
3048       unsigned Reg = FuncInfo->getSRetReturnReg();
3049       if (!Reg) {
3050         MVT PtrTy = getPointerTy(DAG.getDataLayout());
3051         Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3052         FuncInfo->setSRetReturnReg(Reg);
3053       }
3054       SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3055       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3056       break;
3057     }
3058   }
3059
3060   unsigned StackSize = CCInfo.getNextStackOffset();
3061   // Align stack specially for tail calls.
3062   if (shouldGuaranteeTCO(CallConv,
3063                          MF.getTarget().Options.GuaranteedTailCallOpt))
3064     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3065
3066   // If the function takes variable number of arguments, make a frame index for
3067   // the start of the first vararg value... for expansion of llvm.va_start. We
3068   // can skip this if there are no va_start calls.
3069   if (MFI.hasVAStart() &&
3070       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3071                    CallConv != CallingConv::X86_ThisCall))) {
3072     FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3073   }
3074
3075   // Figure out if XMM registers are in use.
3076   assert(!(Subtarget.useSoftFloat() &&
3077            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3078          "SSE register cannot be used when SSE is disabled!");
3079
3080   // 64-bit calling conventions support varargs and register parameters, so we
3081   // have to do extra work to spill them in the prologue.
3082   if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3083     // Find the first unallocated argument registers.
3084     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3085     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3086     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3087     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3088     assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3089            "SSE register cannot be used when SSE is disabled!");
3090
3091     // Gather all the live in physical registers.
3092     SmallVector<SDValue, 6> LiveGPRs;
3093     SmallVector<SDValue, 8> LiveXMMRegs;
3094     SDValue ALVal;
3095     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3096       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3097       LiveGPRs.push_back(
3098           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3099     }
3100     if (!ArgXMMs.empty()) {
3101       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3102       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3103       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3104         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3105         LiveXMMRegs.push_back(
3106             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3107       }
3108     }
3109
3110     if (IsWin64) {
3111       // Get to the caller-allocated home save location.  Add 8 to account
3112       // for the return address.
3113       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3114       FuncInfo->setRegSaveFrameIndex(
3115           MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3116       // Fixup to set vararg frame on shadow area (4 x i64).
3117       if (NumIntRegs < 4)
3118         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3119     } else {
3120       // For X86-64, if there are vararg parameters that are passed via
3121       // registers, then we must store them to their spots on the stack so
3122       // they may be loaded by dereferencing the result of va_next.
3123       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3124       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3125       FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3126           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3127     }
3128
3129     // Store the integer parameter registers.
3130     SmallVector<SDValue, 8> MemOps;
3131     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3132                                       getPointerTy(DAG.getDataLayout()));
3133     unsigned Offset = FuncInfo->getVarArgsGPOffset();
3134     for (SDValue Val : LiveGPRs) {
3135       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3136                                 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3137       SDValue Store =
3138           DAG.getStore(Val.getValue(1), dl, Val, FIN,
3139                        MachinePointerInfo::getFixedStack(
3140                            DAG.getMachineFunction(),
3141                            FuncInfo->getRegSaveFrameIndex(), Offset));
3142       MemOps.push_back(Store);
3143       Offset += 8;
3144     }
3145
3146     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3147       // Now store the XMM (fp + vector) parameter registers.
3148       SmallVector<SDValue, 12> SaveXMMOps;
3149       SaveXMMOps.push_back(Chain);
3150       SaveXMMOps.push_back(ALVal);
3151       SaveXMMOps.push_back(DAG.getIntPtrConstant(
3152                              FuncInfo->getRegSaveFrameIndex(), dl));
3153       SaveXMMOps.push_back(DAG.getIntPtrConstant(
3154                              FuncInfo->getVarArgsFPOffset(), dl));
3155       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3156                         LiveXMMRegs.end());
3157       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3158                                    MVT::Other, SaveXMMOps));
3159     }
3160
3161     if (!MemOps.empty())
3162       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3163   }
3164
3165   if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3166     // Find the largest legal vector type.
3167     MVT VecVT = MVT::Other;
3168     // FIXME: Only some x86_32 calling conventions support AVX512.
3169     if (Subtarget.hasAVX512() &&
3170         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3171                      CallConv == CallingConv::Intel_OCL_BI)))
3172       VecVT = MVT::v16f32;
3173     else if (Subtarget.hasAVX())
3174       VecVT = MVT::v8f32;
3175     else if (Subtarget.hasSSE2())
3176       VecVT = MVT::v4f32;
3177
3178     // We forward some GPRs and some vector types.
3179     SmallVector<MVT, 2> RegParmTypes;
3180     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3181     RegParmTypes.push_back(IntVT);
3182     if (VecVT != MVT::Other)
3183       RegParmTypes.push_back(VecVT);
3184
3185     // Compute the set of forwarded registers. The rest are scratch.
3186     SmallVectorImpl<ForwardedRegister> &Forwards =
3187         FuncInfo->getForwardedMustTailRegParms();
3188     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3189
3190     // Conservatively forward AL on x86_64, since it might be used for varargs.
3191     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3192       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3193       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3194     }
3195
3196     // Copy all forwards from physical to virtual registers.
3197     for (ForwardedRegister &F : Forwards) {
3198       // FIXME: Can we use a less constrained schedule?
3199       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3200       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3201       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3202     }
3203   }
3204
3205   // Some CCs need callee pop.
3206   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3207                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
3208     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3209   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3210     // X86 interrupts must pop the error code (and the alignment padding) if
3211     // present.
3212     FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3213   } else {
3214     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3215     // If this is an sret function, the return should pop the hidden pointer.
3216     if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3217         !Subtarget.getTargetTriple().isOSMSVCRT() &&
3218         argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3219       FuncInfo->setBytesToPopOnReturn(4);
3220   }
3221
3222   if (!Is64Bit) {
3223     // RegSaveFrameIndex is X86-64 only.
3224     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3225     if (CallConv == CallingConv::X86_FastCall ||
3226         CallConv == CallingConv::X86_ThisCall)
3227       // fastcc functions can't have varargs.
3228       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3229   }
3230
3231   FuncInfo->setArgumentStackSize(StackSize);
3232
3233   if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3234     EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3235     if (Personality == EHPersonality::CoreCLR) {
3236       assert(Is64Bit);
3237       // TODO: Add a mechanism to frame lowering that will allow us to indicate
3238       // that we'd prefer this slot be allocated towards the bottom of the frame
3239       // (i.e. near the stack pointer after allocating the frame).  Every
3240       // funclet needs a copy of this slot in its (mostly empty) frame, and the
3241       // offset from the bottom of this and each funclet's frame must be the
3242       // same, so the size of funclets' (mostly empty) frames is dictated by
3243       // how far this slot is from the bottom (since they allocate just enough
3244       // space to accommodate holding this slot at the correct offset).
3245       int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3246       EHInfo->PSPSymFrameIdx = PSPSymFI;
3247     }
3248   }
3249
3250   if (CallConv == CallingConv::X86_RegCall ||
3251       Fn->hasFnAttribute("no_caller_saved_registers")) {
3252     const MachineRegisterInfo &MRI = MF.getRegInfo();
3253     for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3254       MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3255   }
3256
3257   return Chain;
3258 }
3259
3260 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3261                                             SDValue Arg, const SDLoc &dl,
3262                                             SelectionDAG &DAG,
3263                                             const CCValAssign &VA,
3264                                             ISD::ArgFlagsTy Flags) const {
3265   unsigned LocMemOffset = VA.getLocMemOffset();
3266   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3267   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3268                        StackPtr, PtrOff);
3269   if (Flags.isByVal())
3270     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3271
3272   return DAG.getStore(
3273       Chain, dl, Arg, PtrOff,
3274       MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3275 }
3276
3277 /// Load the current return address so that a tail call can relocate it. The
3278 /// loaded value is stored in \p OutRetAddr; the load's chain result is returned.
3279 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3280     SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3281     bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3282   // IsTailCall, Is64Bit and FPDiff are not consulted here; the load is
3283   // emitted unconditionally.
3284   const EVT PtrVT = getPointerTy(DAG.getDataLayout());
3285   SDValue RetSlot = getReturnAddressFrameIndex(DAG);
3286
3287   OutRetAddr = DAG.getLoad(PtrVT, dl, Chain, RetSlot, MachinePointerInfo());
3288   return OutRetAddr.getValue(1);
3289 }
3290
3291 /// Emit a store of the return address if tail call
3292 /// optimization is performed and it is required (FPDiff!=0).
3293 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3294                                         SDValue Chain, SDValue RetAddrFrIdx,
3295                                         EVT PtrVT, unsigned SlotSize,
3296                                         int FPDiff, const SDLoc &dl) {
3297   // Store the return address to the appropriate stack slot.
3298   if (!FPDiff) return Chain;
3299   // Calculate the new stack slot for the return address.
3300   int NewReturnAddrFI =
3301     MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3302                                          false);
3303   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3304   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3305                        MachinePointerInfo::getFixedStack(
3306                            DAG.getMachineFunction(), NewReturnAddrFI));
3307   return Chain;
3308 }
3309
3310 /// Build the vector shuffle used for movs{s|d}/movd style moves: the shuffle
3311 /// mask over (\p V1, \p V2) is {NumElems, 1, 2, ..., NumElems - 1}.
3312 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3313                        SDValue V2) {
3314   const unsigned LaneCount = VT.getVectorNumElements();
3315   SmallVector<int, 8> ShuffleMask;
       // Lane 0 uses index LaneCount; every remaining lane keeps its own index.
3316   ShuffleMask.push_back(LaneCount);
3317   for (unsigned Lane = 1; Lane != LaneCount; ++Lane)
3318     ShuffleMask.push_back(Lane);
3319   return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
3320 }
3321
3322 SDValue
3323 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3324                              SmallVectorImpl<SDValue> &InVals) const {
       // Lower the outgoing call described by CLI into SelectionDAG nodes.
       //
       // Overview of the phases below:
       //   1. Decide whether the call can be a tail call / sibcall (isTailCall is
       //      a reference into CLI, so the decision is visible to the caller).
       //   2. Assign every outgoing argument a location with CC_X86.
       //   3. Open the call sequence (skipped for sibcalls), promote each
       //      argument and route it to RegsToPass or to a stack store.
       //   4. Add the implicit register operands (EBX for PIC/GOT non-tail
       //      calls, AL for 64-bit non-Win64 varargs that are not musttail,
       //      forwarded musttail registers).
       //   5. For non-sibcall tail calls, store memory arguments into fixed stack
       //      objects offset by FPDiff and store the return address to its new
       //      slot.
       //   6. Rewrite the callee address, build the operand list (including the
       //      register mask) and emit either X86ISD::TC_RETURN or X86ISD::CALL
       //      followed by CALLSEQ_END and LowerCallResult.
       //
       // Returns the TC_RETURN node for tail calls, otherwise whatever
       // LowerCallResult returns; InVals is handed through to LowerCallResult.
3325   SelectionDAG &DAG                     = CLI.DAG;
3326   SDLoc &dl                             = CLI.DL;
3327   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3328   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
3329   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
3330   SDValue Chain                         = CLI.Chain;
3331   SDValue Callee                        = CLI.Callee;
3332   CallingConv::ID CallConv              = CLI.CallConv;
3333   bool &isTailCall                      = CLI.IsTailCall;
3334   bool isVarArg                         = CLI.IsVarArg;
3335
3336   MachineFunction &MF = DAG.getMachineFunction();
3337   bool Is64Bit        = Subtarget.is64Bit();
3338   bool IsWin64        = Subtarget.isCallingConvWin64(CallConv);
3339   StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3340   bool IsSibcall      = false;
3341   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3342   auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3343   const CallInst *CI =
3344       CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3345   const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
       // HasNCSR: the call instruction or its statically known callee carries
       // the "no_caller_saved_registers" attribute.
3346   bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3347                  (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3348
3349   if (CallConv == CallingConv::X86_INTR)
3350     report_fatal_error("X86 interrupts may not be called directly");
3351
3352   if (Attr.getValueAsString() == "true")
3353     isTailCall = false;
3354
3355   if (Subtarget.isPICStyleGOT() &&
3356       !MF.getTarget().Options.GuaranteedTailCallOpt) {
3357     // If we are using a GOT, disable tail calls to external symbols with
3358     // default visibility. Tail calling such a symbol requires using a GOT
3359     // relocation, which forces early binding of the symbol. This breaks code
3360     // that requires lazy function symbol resolution. Using musttail or
3361     // GuaranteedTailCallOpt will override this.
3362     GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3363     if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3364                G->getGlobal()->hasDefaultVisibility()))
3365       isTailCall = false;
3366   }
3367
3368   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3369   if (IsMustTail) {
3370     // Force this to be a tail call.  The verifier rules are enough to ensure
3371     // that we can lower this successfully without moving the return address
3372     // around.
3373     isTailCall = true;
3374   } else if (isTailCall) {
3375     // Check if it's really possible to do a tail call.
3376     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3377                     isVarArg, SR != NotStructReturn,
3378                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3379                     Outs, OutVals, Ins, DAG);
3380
3381     // Sibcalls are automatically detected tailcalls which do not require
3382     // ABI changes.
3383     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3384       IsSibcall = true;
3385
3386     if (isTailCall)
3387       ++NumTailCalls;
3388   }
3389
3390   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3391          "Var args not supported with calling convention fastcc, ghc or hipe");
3392
3393   // Analyze operands of the call, assigning locations to each operand.
3394   SmallVector<CCValAssign, 16> ArgLocs;
3395   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3396
3397   // Allocate shadow area for Win64.
3398   if (IsWin64)
3399     CCInfo.AllocateStack(32, 8);
3400
3401   CCInfo.AnalyzeArguments(Outs, CC_X86);
3402
3403   // In vectorcall calling convention a second pass is required for the HVA
3404   // types.
3405   if (CallingConv::X86_VectorCall == CallConv) {
3406     CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3407   }
3408
3409   // Get a count of how many bytes are to be pushed on the stack.
3410   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3411   if (IsSibcall)
3412     // This is a sibcall. The memory operands are available in caller's
3413     // own caller's stack.
3414     NumBytes = 0;
3415   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3416            canGuaranteeTCO(CallConv))
3417     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3418
3419   int FPDiff = 0;
3420   if (isTailCall && !IsSibcall && !IsMustTail) {
3421     // Lower arguments at fp - stackoffset + fpdiff.
3422     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3423
         // NOTE(review): both operands are unsigned, so the subtraction wraps
         // when the callee needs more bytes; the result is then converted to
         // the signed FPDiff.
3424     FPDiff = NumBytesCallerPushed - NumBytes;
3425
3426     // Set the delta of movement of the returnaddr stackslot.
3427     // But only set if delta is greater than previous delta.
3428     if (FPDiff < X86Info->getTCReturnAddrDelta())
3429       X86Info->setTCReturnAddrDelta(FPDiff);
3430   }
3431
3432   unsigned NumBytesToPush = NumBytes;
3433   unsigned NumBytesToPop = NumBytes;
3434
3435   // If we have an inalloca argument, all stack space has already been allocated
3436   // for us and is right at the top of the stack.  We don't support multiple
3437   // arguments passed in memory when using inalloca.
3438   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3439     NumBytesToPush = 0;
3440     if (!ArgLocs.back().isMemLoc())
3441       report_fatal_error("cannot use inalloca attribute on a register "
3442                          "parameter");
3443     if (ArgLocs.back().getLocMemOffset() != 0)
3444       report_fatal_error("any parameter with the inalloca attribute must be "
3445                          "the only memory argument");
3446   }
3447
3448   if (!IsSibcall)
3449     Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3450                                  NumBytes - NumBytesToPush, dl);
3451
3452   SDValue RetAddrFrIdx;
3453   // Load return address for tail calls.
3454   if (isTailCall && FPDiff)
3455     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3456                                     Is64Bit, FPDiff, dl);
3457
3458   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3459   SmallVector<SDValue, 8> MemOpChains;
3460   SDValue StackPtr;
3461
3462   // The next loop assumes that the locations are in the same order of the
3463   // input arguments.
3464   assert(isSortedByValueNo(ArgLocs) &&
3465          "Argument Location list must be sorted before lowering");
3466
3467   // Walk the register/memloc assignments, inserting copies/loads.  In the case
3468   // of tail call optimization arguments are handled later.
3469   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3470   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3471        ++I, ++OutIndex) {
3472     assert(OutIndex < Outs.size() && "Invalid Out index");
3473     // Skip inalloca arguments, they have already been written.
3474     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3475     if (Flags.isInAlloca())
3476       continue;
3477
3478     CCValAssign &VA = ArgLocs[I];
3479     EVT RegVT = VA.getLocVT();
3480     SDValue Arg = OutVals[OutIndex];
3481     bool isByVal = Flags.isByVal();
3482
3483     // Promote the value if needed.
3484     switch (VA.getLocInfo()) {
3485     default: llvm_unreachable("Unknown loc info!");
3486     case CCValAssign::Full: break;
3487     case CCValAssign::SExt:
3488       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3489       break;
3490     case CCValAssign::ZExt:
3491       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3492       break;
3493     case CCValAssign::AExt:
3494       if (Arg.getValueType().isVector() &&
3495           Arg.getValueType().getVectorElementType() == MVT::i1)
3496         Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3497       else if (RegVT.is128BitVector()) {
3498         // Special case: passing MMX values in XMM registers.
3499         Arg = DAG.getBitcast(MVT::i64, Arg);
3500         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3501         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3502       } else
3503         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3504       break;
3505     case CCValAssign::BCvt:
3506       Arg = DAG.getBitcast(RegVT, Arg);
3507       break;
3508     case CCValAssign::Indirect: {
3509       // Store the argument.
3510       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3511       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3512       Chain = DAG.getStore(
3513           Chain, dl, Arg, SpillSlot,
3514           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
           // From here on the argument is the address of the spill slot.
3515       Arg = SpillSlot;
3516       break;
3517     }
3518     }
3519
3520     if (VA.needsCustom()) {
3521       assert(VA.getValVT() == MVT::v64i1 &&
3522              "Currently the only custom case is when we split v64i1 to 2 regs");
3523       // Split v64i1 value into two registers
           // The pre-increment of I consumes the following ArgLocs entry too.
3524       Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3525                          Subtarget);
3526     } else if (VA.isRegLoc()) {
3527       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3528       if (isVarArg && IsWin64) {
3529         // Win64 ABI requires argument XMM reg to be copied to the corresponding
3530         // shadow reg if callee is a varargs function.
3531         unsigned ShadowReg = 0;
3532         switch (VA.getLocReg()) {
3533         case X86::XMM0: ShadowReg = X86::RCX; break;
3534         case X86::XMM1: ShadowReg = X86::RDX; break;
3535         case X86::XMM2: ShadowReg = X86::R8; break;
3536         case X86::XMM3: ShadowReg = X86::R9; break;
3537         }
3538         if (ShadowReg)
3539           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3540       }
3541     } else if (!IsSibcall && (!isTailCall || isByVal)) {
3542       assert(VA.isMemLoc());
           // StackPtr is created once, on the first memory argument.
3543       if (!StackPtr.getNode())
3544         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3545                                       getPointerTy(DAG.getDataLayout()));
3546       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3547                                              dl, DAG, VA, Flags));
3548     }
3549   }
3550
3551   if (!MemOpChains.empty())
3552     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3553
3554   if (Subtarget.isPICStyleGOT()) {
3555     // ELF / PIC requires GOT in the EBX register before function calls via PLT
3556     // GOT pointer.
3557     if (!isTailCall) {
3558       RegsToPass.push_back(std::make_pair(
3559           unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3560                                           getPointerTy(DAG.getDataLayout()))));
3561     } else {
3562       // If we are tail calling and generating PIC/GOT style code load the
3563       // address of the callee into ECX. The value in ecx is used as target of
3564       // the tail jump. This is done to circumvent the ebx/callee-saved problem
3565       // for tail calls on PIC/GOT architectures. Normally we would just put the
3566       // address of GOT into ebx and then call target@PLT. But for tail calls
3567       // ebx would be restored (since ebx is callee saved) before jumping to the
3568       // target@PLT.
3569
3570       // Note: The actual moving to ECX is done further down.
3571       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3572       if (G && !G->getGlobal()->hasLocalLinkage() &&
3573           G->getGlobal()->hasDefaultVisibility())
3574         Callee = LowerGlobalAddress(Callee, DAG);
3575       else if (isa<ExternalSymbolSDNode>(Callee))
3576         Callee = LowerExternalSymbol(Callee, DAG);
3577     }
3578   }
3579
3580   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3581     // From AMD64 ABI document:
3582     // For calls that may call functions that use varargs or stdargs
3583     // (prototype-less calls or calls to functions containing ellipsis (...) in
3584     // the declaration) %al is used as hidden argument to specify the number
3585     // of SSE registers used. The contents of %al do not need to match exactly
3586     // the number of registers, but must be an ubound on the number of SSE
3587     // registers used and is in the range 0 - 8 inclusive.
3588
3589     // Count the number of XMM registers allocated.
3590     static const MCPhysReg XMMArgRegs[] = {
3591       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3592       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3593     };
3594     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3595     assert((Subtarget.hasSSE1() || !NumXMMRegs)
3596            && "SSE registers cannot be used when SSE is disabled");
3597
3598     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3599                                         DAG.getConstant(NumXMMRegs, dl,
3600                                                         MVT::i8)));
3601   }
3602
       // Variadic musttail calls re-pass every forwarded register parameter
       // recorded in X86Info.
3603   if (isVarArg && IsMustTail) {
3604     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3605     for (const auto &F : Forwards) {
3606       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3607       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3608     }
3609   }
3610
3611   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
3612   // don't need this because the eligibility check rejects calls that require
3613   // shuffling arguments passed in memory.
3614   if (!IsSibcall && isTailCall) {
3615     // Force all the incoming stack arguments to be loaded from the stack
3616     // before any new outgoing arguments are stored to the stack, because the
3617     // outgoing stack slots may alias the incoming argument stack slots, and
3618     // the alias isn't otherwise explicit. This is slightly more conservative
3619     // than necessary, because it means that each store effectively depends
3620     // on every argument instead of just those arguments it would clobber.
3621     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3622
3623     SmallVector<SDValue, 8> MemOpChains2;
3624     SDValue FIN;
3625     int FI = 0;
3626     for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3627          ++I, ++OutsIndex) {
3628       CCValAssign &VA = ArgLocs[I];
3629
3630       if (VA.isRegLoc()) {
3631         if (VA.needsCustom()) {
3632           assert((CallConv == CallingConv::X86_RegCall) &&
3633                  "Expecting custom case only in regcall calling convention");
3634           // This means that we are in special case where one argument was
3635           // passed through two register locations - Skip the next location
3636           ++I;
3637         }
3638
3639         continue;
3640       }
3641
3642       assert(VA.isMemLoc());
3643       SDValue Arg = OutVals[OutsIndex];
3644       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3645       // Skip inalloca arguments.  They don't require any work.
3646       if (Flags.isInAlloca())
3647         continue;
3648       // Create frame index.
3649       int32_t Offset = VA.getLocMemOffset()+FPDiff;
           // Size of the location type in bytes, rounded up.
3650       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3651       FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3652       FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3653
3654       if (Flags.isByVal()) {
3655         // Copy relative to framepointer.
3656         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3657         if (!StackPtr.getNode())
3658           StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3659                                         getPointerTy(DAG.getDataLayout()));
3660         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3661                              StackPtr, Source);
3662
3663         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3664                                                          ArgChain,
3665                                                          Flags, DAG, dl));
3666       } else {
3667         // Store relative to framepointer.
3668         MemOpChains2.push_back(DAG.getStore(
3669             ArgChain, dl, Arg, FIN,
3670             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3671       }
3672     }
3673
3674     if (!MemOpChains2.empty())
3675       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3676
3677     // Store the return address to the appropriate stack slot.
3678     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3679                                      getPointerTy(DAG.getDataLayout()),
3680                                      RegInfo->getSlotSize(), FPDiff, dl);
3681   }
3682
3683   // Build a sequence of copy-to-reg nodes chained together with token chain
3684   // and flag operands which copy the outgoing args into registers.
3685   SDValue InFlag;
3686   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3687     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3688                              RegsToPass[i].second, InFlag);
3689     InFlag = Chain.getValue(1);
3690   }
3691
3692   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3693     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3694     // In the 64-bit large code model, we have to make all calls
3695     // through a register, since the call instruction's 32-bit
3696     // pc-relative offset may not be large enough to hold the whole
3697     // address.
         // Callee is deliberately left unmodified in this branch.
3698   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3699     // If the callee is a GlobalAddress node (quite common, every direct call
3700     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3701     // it.
3702     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3703
3704     // We should use extra load for direct calls to dllimported functions in
3705     // non-JIT mode.
3706     const GlobalValue *GV = G->getGlobal();
3707     if (!GV->hasDLLImportStorageClass()) {
3708       unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3709
3710       Callee = DAG.getTargetGlobalAddress(
3711           GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3712
3713       if (OpFlags == X86II::MO_GOTPCREL) {
3714         // Add a wrapper.
3715         Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3716           getPointerTy(DAG.getDataLayout()), Callee);
3717         // Add extra indirection
3718         Callee = DAG.getLoad(
3719             getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3720             MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3721       }
3722     }
3723   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3724     const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3725     unsigned char OpFlags =
3726         Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3727
3728     Callee = DAG.getTargetExternalSymbol(
3729         S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3730   } else if (Subtarget.isTarget64BitILP32() &&
3731              Callee->getValueType(0) == MVT::i32) {
3732     // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3733     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3734   }
3735
3736   // Returns a chain & a flag for retval copy to use.
3737   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3738   SmallVector<SDValue, 8> Ops;
3739
       // Non-sibcall tail calls close their call sequence before the TC_RETURN
       // node is built.
3740   if (!IsSibcall && isTailCall) {
3741     Chain = DAG.getCALLSEQ_END(Chain,
3742                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3743                                DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3744     InFlag = Chain.getValue(1);
3745   }
3746
3747   Ops.push_back(Chain);
3748   Ops.push_back(Callee);
3749
3750   if (isTailCall)
3751     Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3752
3753   // Add argument registers to the end of the list so that they are known live
3754   // into the call.
3755   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3756     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3757                                   RegsToPass[i].second.getValueType()));
3758
3759   // Add a register mask operand representing the call-preserved registers.
3760   // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3761   // set X86_INTR calling convention because it has the same CSR mask
3762   // (same preserved registers).
3763   const uint32_t *Mask = RegInfo->getCallPreservedMask(
3764       MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3765   assert(Mask && "Missing call preserved mask for calling convention");
3766
3767   // If this is an invoke in a 32-bit function using a funclet-based
3768   // personality, assume the function clobbers all registers. If an exception
3769   // is thrown, the runtime will not restore CSRs.
3770   // FIXME: Model this more precisely so that we can register allocate across
3771   // the normal edge and spill and fill across the exceptional edge.
3772   if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3773     const Function *CallerFn = MF.getFunction();
3774     EHPersonality Pers =
3775         CallerFn->hasPersonalityFn()
3776             ? classifyEHPersonality(CallerFn->getPersonalityFn())
3777             : EHPersonality::Unknown;
3778     if (isFuncletEHPersonality(Pers))
3779       Mask = RegInfo->getNoPreservedMask();
3780   }
3781
3782   // Define a new register mask from the existing mask.
       // RegMask stays null unless a custom mask is allocated below; it is
       // handed to LowerCallResult in either state.
3783   uint32_t *RegMask = nullptr;
3784
3785   // In some calling conventions we need to remove the used physical registers
3786   // from the reg mask.
3787   if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3788     const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3789
3790     // Allocate a new Reg Mask and copy Mask.
3791     RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
         // One bit per register, packed into 32-bit words (rounded up).
3792     unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3793     memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3794
3795     // Make sure all sub registers of the argument registers are reset
3796     // in the RegMask.
3797     for (auto const &RegPair : RegsToPass)
3798       for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3799            SubRegs.isValid(); ++SubRegs)
3800         RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3801
3802     // Create the RegMask Operand according to our updated mask.
3803     Ops.push_back(DAG.getRegisterMask(RegMask));
3804   } else {
3805     // Create the RegMask Operand according to the static mask.
3806     Ops.push_back(DAG.getRegisterMask(Mask));
3807   }
3808
3809   if (InFlag.getNode())
3810     Ops.push_back(InFlag);
3811
3812   if (isTailCall) {
3813     // We used to do:
3814     //// If this is the first return lowered for this function, add the regs
3815     //// to the liveout set for the function.
3816     // This isn't right, although it's probably harmless on x86; liveouts
3817     // should be computed from returns not tail calls.  Consider a void
3818     // function making a tail call to a function returning int.
3819     MF.getFrameInfo().setHasTailCall();
3820     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3821   }
3822
3823   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3824   InFlag = Chain.getValue(1);
3825
3826   // Create the CALLSEQ_END node.
3827   unsigned NumBytesForCalleeToPop;
3828   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3829                        DAG.getTarget().Options.GuaranteedTailCallOpt))
3830     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
3831   else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3832            !Subtarget.getTargetTriple().isOSMSVCRT() &&
3833            SR == StackStructReturn)
3834     // If this is a call to a struct-return function, the callee
3835     // pops the hidden struct pointer, so we have to push it back.
3836     // This is common for Darwin/X86, Linux & Mingw32 targets.
3837     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3838     NumBytesForCalleeToPop = 4;
3839   else
3840     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
3841
3842   if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3843     // No need to reset the stack after the call if the call doesn't return. To
3844     // make the MI verify, we'll pretend the callee does it for us.
3845     NumBytesForCalleeToPop = NumBytes;
3846   }
3847
3848   // Returns a flag for retval copy to use.
3849   if (!IsSibcall) {
3850     Chain = DAG.getCALLSEQ_END(Chain,
3851                                DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3852                                DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3853                                                      true),
3854                                InFlag, dl);
3855     InFlag = Chain.getValue(1);
3856   }
3857
3858   // Handle result values, copying them out of physregs into vregs that we
3859   // return.
3860   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3861                          InVals, RegMask);
3862 }
3863
3864 //===----------------------------------------------------------------------===//
3865 //                Fast Calling Convention (tail call) implementation
3866 //===----------------------------------------------------------------------===//
3867
3868 //  Like stdcall, the callee cleans up the arguments, except that ECX is
3869 //  reserved for storing the tail called function address. Only 2 registers are
3870 //  free for argument passing (inreg). Tail call optimization is performed
3871 //  provided:
3872 //                * tailcallopt is enabled
3873 //                * caller/callee are fastcc
3874 //  On X86_64 architecture with GOT-style position independent code only local
3875 //  (within module) calls are supported at the moment.
3876 //  To keep the stack aligned according to platform abi the function
3877 //  GetAlignedArgumentStackSize ensures that argument delta is always multiples
3878 //  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3879 //  If a tail called function callee has more arguments than the caller the
3880 //  caller needs to make sure that there is room to move the RETADDR to. This is
3881 //  achieved by reserving an area the size of the argument delta right after the
3882 //  original RETADDR, but before the saved framepointer or the spilled registers
3883 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3884 //  stack layout:
3885 //    arg1
3886 //    arg2
3887 //    RETADDR
3888 //    [ new RETADDR
3889 //      move area ]
3890 //    (possible EBP)
3891 //    ESI
3892 //    EDI
3893 //    local1 ..
3894
3895 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
3896 /// requirement.
3897 unsigned
3898 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3899                                                SelectionDAG& DAG) const {
3900   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3901   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3902   unsigned StackAlignment = TFI.getStackAlignment();
3903   uint64_t AlignMask = StackAlignment - 1;
3904   int64_t Offset = StackSize;
3905   unsigned SlotSize = RegInfo->getSlotSize();
3906   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3907     // Number smaller than 12 so just add the difference.
3908     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3909   } else {
3910     // Mask out lower bits, add stackalignment once plus the 12 bytes.
3911     Offset = ((~AlignMask) & Offset) + StackAlignment +
3912       (StackAlignment-SlotSize);
3913   }
3914   return Offset;
3915 }
3916
3917 /// Return true if the given stack call argument is already available in the
3918 /// same position (relatively) of the caller's incoming argument stack.
     ///
     /// \p Arg is the outgoing value, \p Offset the stack offset it is compared
     /// against, and \p Flags / \p VA its argument flags and assigned location.
     /// The function returns true only when \p Arg can be traced back to a fixed
     /// frame object located at exactly \p Offset whose size matches (and which,
     /// for non-byval arguments, is immutable and has matching extension flags
     /// when the location type is wider than the value).
3919 static
3920 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3921                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3922                          const X86InstrInfo *TII, const CCValAssign &VA) {
       // Size of the value in bytes; the byval paths below replace it with the
       // byval size.
3923   unsigned Bytes = Arg.getValueSizeInBits() / 8;
3924
3925   for (;;) {
3926     // Look through nodes that don't alter the bits of the incoming value.
3927     unsigned Op = Arg.getOpcode();
3928     if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3929       Arg = Arg.getOperand(0);
3930       continue;
3931     }
         // A TRUNCATE is only looked through when its input is an AssertZext
         // whose asserted type equals the truncated type.
3932     if (Op == ISD::TRUNCATE) {
3933       const SDValue &TruncInput = Arg.getOperand(0);
3934       if (TruncInput.getOpcode() == ISD::AssertZext &&
3935           cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3936               Arg.getValueType()) {
3937         Arg = TruncInput.getOperand(0);
3938         continue;
3939       }
3940     }
3941     break;
3942   }
3943
       // INT_MAX marks "no frame index found yet"; each path below is expected
       // to set FI or return false (checked by the assert further down).
3944   int FI = INT_MAX;
3945   if (Arg.getOpcode() == ISD::CopyFromReg) {
         // Case 1: the value is copied out of a virtual register; inspect the
         // machine instruction that defines it.
3946     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3947     if (!TargetRegisterInfo::isVirtualRegister(VR))
3948       return false;
3949     MachineInstr *Def = MRI->getVRegDef(VR);
3950     if (!Def)
3951       return false;
3952     if (!Flags.isByVal()) {
3953       if (!TII->isLoadFromStackSlot(*Def, FI))
3954         return false;
3955     } else {
           // Byval: the register must hold the address of a frame object,
           // i.e. be defined by an LEA whose operand 1 is a frame index.
3956       unsigned Opcode = Def->getOpcode();
3957       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3958            Opcode == X86::LEA64_32r) &&
3959           Def->getOperand(1).isFI()) {
3960         FI = Def->getOperand(1).getIndex();
3961         Bytes = Flags.getByValSize();
3962       } else
3963         return false;
3964     }
3965   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
         // Case 2: a load; only accepted when its base pointer is a frame index.
3966     if (Flags.isByVal())
3967       // ByVal argument is passed in as a pointer but it's now being
3968       // dereferenced. e.g.
3969       // define @foo(%struct.X* %A) {
3970       //   tail call @bar(%struct.X* byval %A)
3971       // }
3972       return false;
3973     SDValue Ptr = Ld->getBasePtr();
3974     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3975     if (!FINode)
3976       return false;
3977     FI = FINode->getIndex();
3978   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
         // Case 3: a byval argument given directly as a frame index.
3979     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3980     FI = FINode->getIndex();
3981     Bytes = Flags.getByValSize();
3982   } else
3983     return false;
3984
3985   assert(FI != INT_MAX);
3986   if (!MFI.isFixedObjectIndex(FI))
3987     return false;
3988
3989   if (Offset != MFI.getObjectOffset(FI))
3990     return false;
3991
3992   // If this is not byval, check that the argument stack object is immutable.
3993   // inalloca and argument copy elision can create mutable argument stack
3994   // objects. Byval objects can be mutated, but a byval call intends to pass the
3995   // mutated memory.
3996   if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
3997     return false;
3998
3999   if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4000     // If the argument location is wider than the argument type, check that any
4001     // extension flags match.
4002     if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4003         Flags.isSExt() != MFI.isObjectSExt(FI)) {
4004       return false;
4005     }
4006   }
4007
       // Finally the sizes must agree exactly.
4008   return Bytes == MFI.getObjectSize(FI);
4009 }
4010
/// Check whether the call is eligible for tail call optimization. Targets
/// that want to do tail call optimization should implement this function.
///
/// The checks are performed in order and the first failing one rejects the
/// call. When guaranteed tail call optimization is enabled the decision is
/// made solely from the calling conventions; otherwise the remaining checks
/// look for a sibcall that needs no ABI changes.
bool X86TargetLowering::IsEligibleForTailCallOptimization(
    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
    bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
    const SmallVectorImpl<ISD::OutputArg> &Outs,
    const SmallVectorImpl<SDValue> &OutVals,
    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  // The callee's calling convention must allow tail calls at all.
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = MF.getFunction();

  // If the function return type is x86_fp80 and the callee return type is not,
  // then the FP_EXTEND of the call result is not a nop. It's not safe to
  // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;

  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;
  bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
  bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

  // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable: the
  // answer then depends only on whether the callee's convention can guarantee
  // TCO and matches the caller's convention.
  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
  LLVMContext &C = *DAG.getContext();
  if (isVarArg && !Outs.empty()) {
    // Optimizing for varargs on Win64 is unlikely to be safe without
    // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;

    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack.  Therefore, if it's not used by the call it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    // At least one result is unused: reject if any result lives in FP0 / FP1.
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }

  // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
  // The callee has to preserve all registers the caller needs to preserve.
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    // With identical conventions the masks are the same, so the subset test
    // is only needed when the conventions differ.
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }

  // Number of bytes of stack the outgoing arguments occupy; stays 0 when the
  // call has no arguments. Used by the callee-pop check at the end.
  unsigned StackArgsSize = 0;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

    // Allocate shadow area for Win64
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);

    CCInfo.AnalyzeCallOperands(Outs, CC_X86);
    StackArgsSize = CCInfo.getNextStackOffset();

    if (CCInfo.getNextStackOffset()) {
      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo &MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII = Subtarget.getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Indirectly passed arguments are never accepted here.
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        // Each stack-passed argument must already be in the expected slot.
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII, VA))
            return false;
        }
      }
    }

    bool PositionIndependent = isPositionIndependent();
    // If the tailcall address may be in a register, then make sure it's
    // possible to register allocate for it. In 32-bit, the call address can
    // only target EAX, EDX, or ECX since the tail call must be scheduled after
    // callee-saved registers are restored. These happen to be the same
    // registers used to pass 'inreg' arguments so watch out for those.
    if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
                                  !isa<ExternalSymbolSDNode>(Callee)) ||
                                 PositionIndependent)) {
      unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
      unsigned MaxInRegs = PositionIndependent ? 2 : 3;

      // Count arguments assigned to EAX/EDX/ECX and give up as soon as the
      // count reaches MaxInRegs.
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }

    // Arguments placed in registers the caller must preserve have to pass the
    // parametersInCSRMatch check.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }

  bool CalleeWillPop =
      X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt);

  if (unsigned BytesToPop =
          MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
    // If we have bytes to pop, the callee must pop them.
    bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}
4203
/// Create a FastISel instance by forwarding \p funcInfo and \p libInfo
/// unchanged to X86::createFastISel.
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                  const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
4209
4210 //===----------------------------------------------------------------------===//
4211 //                           Other Lowering Hooks
4212 //===----------------------------------------------------------------------===//
4213
4214 static bool MayFoldLoad(SDValue Op) {
4215   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4216 }
4217
4218 static bool MayFoldIntoStore(SDValue Op) {
4219   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4220 }
4221
4222 static bool MayFoldIntoZeroExtend(SDValue Op) {
4223   if (Op.hasOneUse()) {
4224     unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4225     return (ISD::ZERO_EXTEND == Opcode);
4226   }
4227   return false;
4228 }
4229
4230 static bool isTargetShuffle(unsigned Opcode) {
4231   switch(Opcode) {
4232   default: return false;
4233   case X86ISD::BLENDI:
4234   case X86ISD::PSHUFB:
4235   case X86ISD::PSHUFD:
4236   case X86ISD::PSHUFHW:
4237   case X86ISD::PSHUFLW:
4238   case X86ISD::SHUFP:
4239   case X86ISD::INSERTPS:
4240   case X86ISD::EXTRQI:
4241   case X86ISD::INSERTQI:
4242   case X86ISD::PALIGNR:
4243   case X86ISD::VSHLDQ:
4244   case X86ISD::VSRLDQ:
4245   case X86ISD::MOVLHPS:
4246   case X86ISD::MOVLHPD:
4247   case X86ISD::MOVHLPS:
4248   case X86ISD::MOVLPS:
4249   case X86ISD::MOVLPD:
4250   case X86ISD::MOVSHDUP:
4251   case X86ISD::MOVSLDUP:
4252   case X86ISD::MOVDDUP:
4253   case X86ISD::MOVSS:
4254   case X86ISD::MOVSD:
4255   case X86ISD::UNPCKL:
4256   case X86ISD::UNPCKH:
4257   case X86ISD::VBROADCAST:
4258   case X86ISD::VPERMILPI:
4259   case X86ISD::VPERMILPV:
4260   case X86ISD::VPERM2X128:
4261   case X86ISD::VPERMIL2:
4262   case X86ISD::VPERMI:
4263   case X86ISD::VPPERM:
4264   case X86ISD::VPERMV:
4265   case X86ISD::VPERMV3:
4266   case X86ISD::VPERMIV3:
4267   case X86ISD::VZEXT_MOVL:
4268     return true;
4269   }
4270 }
4271
4272 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4273   switch (Opcode) {
4274   default: return false;
4275   // Target Shuffles.
4276   case X86ISD::PSHUFB:
4277   case X86ISD::VPERMILPV:
4278   case X86ISD::VPERMIL2:
4279   case X86ISD::VPPERM:
4280   case X86ISD::VPERMV:
4281   case X86ISD::VPERMV3:
4282   case X86ISD::VPERMIV3:
4283     return true;
4284   // 'Faux' Target Shuffles.
4285   case ISD::AND:
4286   case X86ISD::ANDNP:
4287     return true;
4288   }
4289 }
4290
4291 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4292   MachineFunction &MF = DAG.getMachineFunction();
4293   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4294   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4295   int ReturnAddrIndex = FuncInfo->getRAIndex();
4296
4297   if (ReturnAddrIndex == 0) {
4298     // Set up a frame object for the return address.
4299     unsigned SlotSize = RegInfo->getSlotSize();
4300     ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4301                                                           -(int64_t)SlotSize,
4302                                                           false);
4303     FuncInfo->setRAIndex(ReturnAddrIndex);
4304   }
4305
4306   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4307 }
4308
4309 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4310                                        bool hasSymbolicDisplacement) {
4311   // Offset should fit into 32 bit immediate field.
4312   if (!isInt<32>(Offset))
4313     return false;
4314
4315   // If we don't have a symbolic displacement - we don't have any extra
4316   // restrictions.
4317   if (!hasSymbolicDisplacement)
4318     return true;
4319
4320   // FIXME: Some tweaks might be needed for medium code model.
4321   if (M != CodeModel::Small && M != CodeModel::Kernel)
4322     return false;
4323
4324   // For small code model we assume that latest object is 16MB before end of 31
4325   // bits boundary. We may also accept pretty large negative constants knowing
4326   // that all objects are in the positive half of address space.
4327   if (M == CodeModel::Small && Offset < 16*1024*1024)
4328     return true;
4329
4330   // For kernel code model we know that all object resist in the negative half
4331   // of 32bits address space. We may not accept negative offsets, since they may
4332   // be just off and we may accept pretty large positive ones.
4333   if (M == CodeModel::Kernel && Offset >= 0)
4334     return true;
4335
4336   return false;
4337 }
4338
4339 /// Determines whether the callee is required to pop its own arguments.
4340 /// Callee pop is necessary to support tail calls.
4341 bool X86::isCalleePop(CallingConv::ID CallingConv,
4342                       bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4343   // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4344   // can guarantee TCO.
4345   if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4346     return true;
4347
4348   switch (CallingConv) {
4349   default:
4350     return false;
4351   case CallingConv::X86_StdCall:
4352   case CallingConv::X86_FastCall:
4353   case CallingConv::X86_ThisCall:
4354   case CallingConv::X86_VectorCall:
4355     return !is64Bit;
4356   }
4357 }
4358
4359 /// \brief Return true if the condition is an unsigned comparison operation.
4360 static bool isX86CCUnsigned(unsigned X86CC) {
4361   switch (X86CC) {
4362   default:
4363     llvm_unreachable("Invalid integer condition!");
4364   case X86::COND_E:
4365   case X86::COND_NE:
4366   case X86::COND_B:
4367   case X86::COND_A:
4368   case X86::COND_BE:
4369   case X86::COND_AE:
4370     return true;
4371   case X86::COND_G:
4372   case X86::COND_GE:
4373   case X86::COND_L:
4374   case X86::COND_LE:
4375     return false;
4376   }
4377 }
4378
4379 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4380   switch (SetCCOpcode) {
4381   default: llvm_unreachable("Invalid integer condition!");
4382   case ISD::SETEQ:  return X86::COND_E;
4383   case ISD::SETGT:  return X86::COND_G;
4384   case ISD::SETGE:  return X86::COND_GE;
4385   case ISD::SETLT:  return X86::COND_L;
4386   case ISD::SETLE:  return X86::COND_LE;
4387   case ISD::SETNE:  return X86::COND_NE;
4388   case ISD::SETULT: return X86::COND_B;
4389   case ISD::SETUGT: return X86::COND_A;
4390   case ISD::SETULE: return X86::COND_BE;
4391   case ISD::SETUGE: return X86::COND_AE;
4392   }
4393 }
4394
4395 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4396 /// condition code, returning the condition code and the LHS/RHS of the
4397 /// comparison to make.
4398 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4399                                bool isFP, SDValue &LHS, SDValue &RHS,
4400                                SelectionDAG &DAG) {
4401   if (!isFP) {
4402     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4403       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4404         // X > -1   -> X == 0, jump !sign.
4405         RHS = DAG.getConstant(0, DL, RHS.getValueType());
4406         return X86::COND_NS;
4407       }
4408       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4409         // X < 0   -> X == 0, jump on sign.
4410         return X86::COND_S;
4411       }
4412       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4413         // X < 1   -> X <= 0
4414         RHS = DAG.getConstant(0, DL, RHS.getValueType());
4415         return X86::COND_LE;
4416       }
4417     }
4418
4419     return TranslateIntegerX86CC(SetCCOpcode);
4420   }
4421
4422   // First determine if it is required or is profitable to flip the operands.
4423
4424   // If LHS is a foldable load, but RHS is not, flip the condition.
4425   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4426       !ISD::isNON_EXTLoad(RHS.getNode())) {
4427     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4428     std::swap(LHS, RHS);
4429   }
4430
4431   switch (SetCCOpcode) {
4432   default: break;
4433   case ISD::SETOLT:
4434   case ISD::SETOLE:
4435   case ISD::SETUGT:
4436   case ISD::SETUGE:
4437     std::swap(LHS, RHS);
4438     break;
4439   }
4440
4441   // On a floating point condition, the flags are set as follows:
4442   // ZF  PF  CF   op
4443   //  0 | 0 | 0 | X > Y
4444   //  0 | 0 | 1 | X < Y
4445   //  1 | 0 | 0 | X == Y
4446   //  1 | 1 | 1 | unordered
4447   switch (SetCCOpcode) {
4448   default: llvm_unreachable("Condcode should be pre-legalized away");
4449   case ISD::SETUEQ:
4450   case ISD::SETEQ:   return X86::COND_E;
4451   case ISD::SETOLT:              // flipped
4452   case ISD::SETOGT:
4453   case ISD::SETGT:   return X86::COND_A;
4454   case ISD::SETOLE:              // flipped
4455   case ISD::SETOGE:
4456   case ISD::SETGE:   return X86::COND_AE;
4457   case ISD::SETUGT:              // flipped
4458   case ISD::SETULT:
4459   case ISD::SETLT:   return X86::COND_B;
4460   case ISD::SETUGE:              // flipped
4461   case ISD::SETULE:
4462   case ISD::SETLE:   return X86::COND_BE;
4463   case ISD::SETONE:
4464   case ISD::SETNE:   return X86::COND_NE;
4465   case ISD::SETUO:   return X86::COND_P;
4466   case ISD::SETO:    return X86::COND_NP;
4467   case ISD::SETOEQ:
4468   case ISD::SETUNE:  return X86::COND_INVALID;
4469   }
4470 }
4471
4472 /// Is there a floating point cmov for the specific X86 condition code?
4473 /// Current x86 isa includes the following FP cmov instructions:
4474 /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4475 static bool hasFPCMov(unsigned X86CC) {
4476   switch (X86CC) {
4477   default:
4478     return false;
4479   case X86::COND_B:
4480   case X86::COND_BE:
4481   case X86::COND_E:
4482   case X86::COND_P:
4483   case X86::COND_A:
4484   case X86::COND_AE:
4485   case X86::COND_NE:
4486   case X86::COND_NP:
4487     return true;
4488   }
4489 }
4490
4491
4492 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4493                                            const CallInst &I,
4494                                            unsigned Intrinsic) const {
4495
4496   const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4497   if (!IntrData)
4498     return false;
4499
4500   Info.opc = ISD::INTRINSIC_W_CHAIN;
4501   Info.readMem = false;
4502   Info.writeMem = false;
4503   Info.vol = false;
4504   Info.offset = 0;
4505
4506   switch (IntrData->Type) {
4507   case EXPAND_FROM_MEM: {
4508     Info.ptrVal = I.getArgOperand(0);
4509     Info.memVT = MVT::getVT(I.getType());
4510     Info.align = 1;
4511     Info.readMem = true;
4512     break;
4513   }
4514   case COMPRESS_TO_MEM: {
4515     Info.ptrVal = I.getArgOperand(0);
4516     Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4517     Info.align = 1;
4518     Info.writeMem = true;
4519     break;
4520   }
4521   case TRUNCATE_TO_MEM_VI8:
4522   case TRUNCATE_TO_MEM_VI16:
4523   case TRUNCATE_TO_MEM_VI32: {
4524     Info.ptrVal = I.getArgOperand(0);
4525     MVT VT  = MVT::getVT(I.getArgOperand(1)->getType());
4526     MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4527     if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4528       ScalarVT = MVT::i8;
4529     else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4530       ScalarVT = MVT::i16;
4531     else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4532       ScalarVT = MVT::i32;
4533
4534     Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4535     Info.align = 1;
4536     Info.writeMem = true;
4537     break;
4538   }
4539   default:
4540     return false;
4541   }
4542
4543   return true;
4544 }
4545
4546 /// Returns true if the target can instruction select the
4547 /// specified FP immediate natively. If false, the legalizer will
4548 /// materialize the FP immediate as a load from a constant pool.
4549 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4550   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4551     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4552       return true;
4553   }
4554   return false;
4555 }
4556
4557 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4558                                               ISD::LoadExtType ExtTy,
4559                                               EVT NewVT) const {
4560   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4561   // relocation target a movq or addq instruction: don't let the load shrink.
4562   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4563   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4564     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4565       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4566   return true;
4567 }
4568
4569 /// \brief Returns true if it is beneficial to convert a load of a constant
4570 /// to just the constant itself.
4571 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4572                                                           Type *Ty) const {
4573   assert(Ty->isIntegerTy());
4574
4575   unsigned BitSize = Ty->getPrimitiveSizeInBits();
4576   if (BitSize == 0 || BitSize > 64)
4577     return false;
4578   return true;
4579 }
4580
4581 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4582                                                 unsigned Index) const {
4583   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4584     return false;
4585
4586   return (Index == 0 || Index == ResVT.getVectorNumElements());
4587 }
4588
/// Return whether cttz is cheap to speculate; this is exactly the subtarget's
/// hasBMI() answer.
bool X86TargetLowering::isCheapToSpeculateCttz() const {
  // Speculate cttz only if we can directly use TZCNT.
  return Subtarget.hasBMI();
}
4593
/// Return whether ctlz is cheap to speculate; this is exactly the subtarget's
/// hasLZCNT() answer.
bool X86TargetLowering::isCheapToSpeculateCtlz() const {
  // Speculate ctlz only if we can directly use LZCNT.
  return Subtarget.hasLZCNT();
}
4598
/// Return whether ctlz is fast; forwards the subtarget's hasFastLZCNT() query.
bool X86TargetLowering::isCtlzFast() const {
  return Subtarget.hasFastLZCNT();
}
4602
/// Always returns true; the instruction \p AndI is not inspected.
bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
    const Instruction &AndI) const {
  return true;
}
4607
4608 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4609   if (!Subtarget.hasBMI())
4610     return false;
4611
4612   // There are only 32-bit and 64-bit forms for 'andn'.
4613   EVT VT = Y.getValueType();
4614   if (VT != MVT::i32 && VT != MVT::i64)
4615     return false;
4616
4617   return true;
4618 }
4619
4620 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4621   MVT VT = MVT::getIntegerVT(NumBits);
4622   if (isTypeLegal(VT))
4623     return VT;
4624
4625   // PMOVMSKB can handle this.
4626   if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4627     return MVT::v16i8;
4628
4629   // VPMOVMSKB can handle this.
4630   if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4631     return MVT::v32i8;
4632
4633   // TODO: Allow 64-bit type for 32-bit target.
4634   // TODO: 512-bit types should be allowed, but make sure that those
4635   // cases are handled in combineVectorSizedSetCCEquality().
4636
4637   return MVT::INVALID_SIMPLE_VALUE_TYPE;
4638 }
4639
4640 /// Val is the undef sentinel value or equal to the specified value.
4641 static bool isUndefOrEqual(int Val, int CmpVal) {
4642   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4643 }
4644
4645 /// Val is either the undef or zero sentinel value.
4646 static bool isUndefOrZero(int Val) {
4647   return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4648 }
4649
4650 /// Return true if every element in Mask, beginning
4651 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4652 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4653   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4654     if (Mask[i] != SM_SentinelUndef)
4655       return false;
4656   return true;
4657 }
4658
4659 /// Return true if Val is undef or if its value falls within the
4660 /// specified range (L, H].
4661 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4662   return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4663 }
4664
4665 /// Return true if every element in Mask is undef or if its value
4666 /// falls within the specified range (L, H].
4667 static bool isUndefOrInRange(ArrayRef<int> Mask,
4668                              int Low, int Hi) {
4669   for (int M : Mask)
4670     if (!isUndefOrInRange(M, Low, Hi))
4671       return false;
4672   return true;
4673 }
4674
4675 /// Return true if Val is undef, zero or if its value falls within the
4676 /// specified range (L, H].
4677 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4678   return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4679 }
4680
4681 /// Return true if every element in Mask is undef, zero or if its value
4682 /// falls within the specified range (L, H].
4683 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4684   for (int M : Mask)
4685     if (!isUndefOrZeroOrInRange(M, Low, Hi))
4686       return false;
4687   return true;
4688 }
4689
4690 /// Return true if every element in Mask, beginning
4691 /// from position Pos and ending in Pos+Size, falls within the specified
4692 /// sequential range (Low, Low+Size]. or is undef.
4693 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4694                                        unsigned Pos, unsigned Size, int Low) {
4695   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4696     if (!isUndefOrEqual(Mask[i], Low))
4697       return false;
4698   return true;
4699 }
4700
4701 /// Return true if every element in Mask, beginning
4702 /// from position Pos and ending in Pos+Size, falls within the specified
4703 /// sequential range (Low, Low+Size], or is undef or is zero.
4704 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4705                                              unsigned Size, int Low) {
4706   for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4707     if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4708       return false;
4709   return true;
4710 }
4711
4712 /// Return true if every element in Mask, beginning
4713 /// from position Pos and ending in Pos+Size is undef or is zero.
4714 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4715                                  unsigned Size) {
4716   for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4717     if (!isUndefOrZero(Mask[i]))
4718       return false;
4719   return true;
4720 }
4721
4722 /// \brief Helper function to test whether a shuffle mask could be
4723 /// simplified by widening the elements being shuffled.
4724 ///
4725 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4726 /// leaves it in an unspecified state.
4727 ///
4728 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4729 /// shuffle masks. The latter have the special property of a '-2' representing
4730 /// a zero-ed lane of a vector.
4731 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4732                                     SmallVectorImpl<int> &WidenedMask) {
4733   WidenedMask.assign(Mask.size() / 2, 0);
4734   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4735     int M0 = Mask[i];
4736     int M1 = Mask[i + 1];
4737
4738     // If both elements are undef, its trivial.
4739     if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4740       WidenedMask[i / 2] = SM_SentinelUndef;
4741       continue;
4742     }
4743
4744     // Check for an undef mask and a mask value properly aligned to fit with
4745     // a pair of values. If we find such a case, use the non-undef mask's value.
4746     if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4747       WidenedMask[i / 2] = M1 / 2;
4748       continue;
4749     }
4750     if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4751       WidenedMask[i / 2] = M0 / 2;
4752       continue;
4753     }
4754
4755     // When zeroing, we need to spread the zeroing across both lanes to widen.
4756     if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4757       if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4758           (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4759         WidenedMask[i / 2] = SM_SentinelZero;
4760         continue;
4761       }
4762       return false;
4763     }
4764
4765     // Finally check if the two mask values are adjacent and aligned with
4766     // a pair.
4767     if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4768       WidenedMask[i / 2] = M0 / 2;
4769       continue;
4770     }
4771
4772     // Otherwise we can't safely widen the elements used in this shuffle.
4773     return false;
4774   }
4775   assert(WidenedMask.size() == Mask.size() / 2 &&
4776          "Incorrect size of mask after widening the elements!");
4777
4778   return true;
4779 }
4780
4781 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4782 /// mask index with the scaled sequential indices for an equivalent narrowed
4783 /// mask. This is the reverse process to canWidenShuffleElements, but can always
4784 /// succeed.
4785 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4786                              SmallVectorImpl<int> &ScaledMask) {
4787   assert(0 < Scale && "Unexpected scaling factor");
4788   int NumElts = Mask.size();
4789   ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
4790
4791   for (int i = 0; i != NumElts; ++i) {
4792     int M = Mask[i];
4793
4794     // Repeat sentinel values in every mask element.
4795     if (M < 0) {
4796       for (int s = 0; s != Scale; ++s)
4797         ScaledMask[(Scale * i) + s] = M;
4798       continue;
4799     }
4800
4801     // Scale mask element and increment across each mask element.
4802     for (int s = 0; s != Scale; ++s)
4803       ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4804   }
4805 }
4806
4807 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4808 /// extract that is suitable for instruction that extract 128 or 256 bit vectors
4809 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4810   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4811   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4812     return false;
4813
4814   // The index should be aligned on a vecWidth-bit boundary.
4815   uint64_t Index = N->getConstantOperandVal(1);
4816   MVT VT = N->getSimpleValueType(0);
4817   unsigned ElSize = VT.getScalarSizeInBits();
4818   return (Index * ElSize) % vecWidth == 0;
4819 }
4820
4821 /// Return true if the specified INSERT_SUBVECTOR
4822 /// operand specifies a subvector insert that is suitable for input to
4823 /// insertion of 128 or 256-bit subvectors
4824 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4825   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4826   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4827     return false;
4828
4829   // The index should be aligned on a vecWidth-bit boundary.
4830   uint64_t Index = N->getConstantOperandVal(2);
4831   MVT VT = N->getSimpleValueType(0);
4832   unsigned ElSize = VT.getScalarSizeInBits();
4833   return (Index * ElSize) % vecWidth == 0;
4834 }
4835
4836 bool X86::isVINSERT128Index(SDNode *N) {
4837   return isVINSERTIndex(N, 128);
4838 }
4839
4840 bool X86::isVINSERT256Index(SDNode *N) {
4841   return isVINSERTIndex(N, 256);
4842 }
4843
4844 bool X86::isVEXTRACT128Index(SDNode *N) {
4845   return isVEXTRACTIndex(N, 128);
4846 }
4847
4848 bool X86::isVEXTRACT256Index(SDNode *N) {
4849   return isVEXTRACTIndex(N, 256);
4850 }
4851
4852 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4853   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4854   assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4855          "Illegal extract subvector for VEXTRACT");
4856
4857   uint64_t Index = N->getConstantOperandVal(1);
4858   MVT VecVT = N->getOperand(0).getSimpleValueType();
4859   unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4860   return Index / NumElemsPerChunk;
4861 }
4862
4863 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4864   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4865   assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4866          "Illegal insert subvector for VINSERT");
4867
4868   uint64_t Index = N->getConstantOperandVal(2);
4869   MVT VecVT = N->getSimpleValueType(0);
4870   unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4871   return Index / NumElemsPerChunk;
4872 }
4873
4874 /// Return the appropriate immediate to extract the specified
4875 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
4876 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4877   return getExtractVEXTRACTImmediate(N, 128);
4878 }
4879
4880 /// Return the appropriate immediate to extract the specified
4881 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
4882 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4883   return getExtractVEXTRACTImmediate(N, 256);
4884 }
4885
4886 /// Return the appropriate immediate to insert at the specified
4887 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4888 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4889   return getInsertVINSERTImmediate(N, 128);
4890 }
4891
4892 /// Return the appropriate immediate to insert at the specified
4893 /// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions.
4894 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4895   return getInsertVINSERTImmediate(N, 256);
4896 }
4897
4898 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4899 bool X86::isZeroNode(SDValue Elt) {
4900   return isNullConstant(Elt) || isNullFPConstant(Elt);
4901 }
4902
4903 // Build a vector of constants.
4904 // Use an UNDEF node if MaskElt == -1.
4905 // Split 64-bit constants in the 32-bit mode.
4906 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4907                               const SDLoc &dl, bool IsMask = false) {
4908
4909   SmallVector<SDValue, 32>  Ops;
4910   bool Split = false;
4911
4912   MVT ConstVecVT = VT;
4913   unsigned NumElts = VT.getVectorNumElements();
4914   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4915   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4916     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4917     Split = true;
4918   }
4919
4920   MVT EltVT = ConstVecVT.getVectorElementType();
4921   for (unsigned i = 0; i < NumElts; ++i) {
4922     bool IsUndef = Values[i] < 0 && IsMask;
4923     SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4924       DAG.getConstant(Values[i], dl, EltVT);
4925     Ops.push_back(OpNode);
4926     if (Split)
4927       Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4928                     DAG.getConstant(0, dl, EltVT));
4929   }
4930   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4931   if (Split)
4932     ConstsNode = DAG.getBitcast(VT, ConstsNode);
4933   return ConstsNode;
4934 }
4935
4936 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4937                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4938   assert(Bits.size() == Undefs.getBitWidth() &&
4939          "Unequal constant and undef arrays");
4940   SmallVector<SDValue, 32> Ops;
4941   bool Split = false;
4942
4943   MVT ConstVecVT = VT;
4944   unsigned NumElts = VT.getVectorNumElements();
4945   bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4946   if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4947     ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4948     Split = true;
4949   }
4950
4951   MVT EltVT = ConstVecVT.getVectorElementType();
4952   for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4953     if (Undefs[i]) {
4954       Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4955       continue;
4956     }
4957     const APInt &V = Bits[i];
4958     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4959     if (Split) {
4960       Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4961       Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4962     } else if (EltVT == MVT::f32) {
4963       APFloat FV(APFloat::IEEEsingle(), V);
4964       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4965     } else if (EltVT == MVT::f64) {
4966       APFloat FV(APFloat::IEEEdouble(), V);
4967       Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4968     } else {
4969       Ops.push_back(DAG.getConstant(V, dl, EltVT));
4970     }
4971   }
4972
4973   SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4974   return DAG.getBitcast(VT, ConstsNode);
4975 }
4976
4977 /// Returns a vector of specified type with all zero elements.
4978 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4979                              SelectionDAG &DAG, const SDLoc &dl) {
4980   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4981           VT.getVectorElementType() == MVT::i1) &&
4982          "Unexpected vector type");
4983
4984   // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4985   // type. This ensures they get CSE'd. But if the integer type is not
4986   // available, use a floating-point +0.0 instead.
4987   SDValue Vec;
4988   if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4989     Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4990   } else if (VT.getVectorElementType() == MVT::i1) {
4991     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4992            "Unexpected vector type");
4993     assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4994            "Unexpected vector type");
4995     Vec = DAG.getConstant(0, dl, VT);
4996   } else {
4997     unsigned Num32BitElts = VT.getSizeInBits() / 32;
4998     Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4999   }
5000   return DAG.getBitcast(VT, Vec);
5001 }
5002
5003 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5004                                 const SDLoc &dl, unsigned vectorWidth) {
5005   EVT VT = Vec.getValueType();
5006   EVT ElVT = VT.getVectorElementType();
5007   unsigned Factor = VT.getSizeInBits()/vectorWidth;
5008   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5009                                   VT.getVectorNumElements()/Factor);
5010
5011   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
5012   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5013   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5014
5015   // This is the index of the first element of the vectorWidth-bit chunk
5016   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5017   IdxVal &= ~(ElemsPerChunk - 1);
5018
5019   // If the input is a buildvector just emit a smaller one.
5020   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5021     return DAG.getBuildVector(
5022         ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
5023
5024   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5025   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5026 }
5027
5028 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
5029 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5030 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5031 /// instructions or a simple subregister reference. Idx is an index in the
5032 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
5033 /// lowering EXTRACT_VECTOR_ELT operations easier.
5034 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5035                                    SelectionDAG &DAG, const SDLoc &dl) {
5036   assert((Vec.getValueType().is256BitVector() ||
5037           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5038   return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5039 }
5040
5041 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5042 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5043                                    SelectionDAG &DAG, const SDLoc &dl) {
5044   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5045   return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5046 }
5047
5048 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5049                                SelectionDAG &DAG, const SDLoc &dl,
5050                                unsigned vectorWidth) {
5051   assert((vectorWidth == 128 || vectorWidth == 256) &&
5052          "Unsupported vector width");
5053   // Inserting UNDEF is Result
5054   if (Vec.isUndef())
5055     return Result;
5056   EVT VT = Vec.getValueType();
5057   EVT ElVT = VT.getVectorElementType();
5058   EVT ResultVT = Result.getValueType();
5059
5060   // Insert the relevant vectorWidth bits.
5061   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5062   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5063
5064   // This is the index of the first element of the vectorWidth-bit chunk
5065   // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5066   IdxVal &= ~(ElemsPerChunk - 1);
5067
5068   SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5069   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5070 }
5071
5072 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
5073 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5074 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5075 /// simple superregister reference.  Idx is an index in the 128 bits
5076 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
5077 /// lowering INSERT_VECTOR_ELT operations easier.
5078 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5079                                   SelectionDAG &DAG, const SDLoc &dl) {
5080   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5081   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5082 }
5083
5084 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5085                                   SelectionDAG &DAG, const SDLoc &dl) {
5086   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5087   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5088 }
5089
5090 // Return true if the instruction zeroes the unused upper part of the
5091 // destination and accepts mask.
5092 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5093   switch (Opcode) {
5094   default:
5095     return false;
5096   case X86ISD::PCMPEQM:
5097   case X86ISD::PCMPGTM:
5098   case X86ISD::CMPM:
5099   case X86ISD::CMPMU:
5100     return true;
5101   }
5102 }
5103
5104 /// Insert i1-subvector to i1-vector.
5105 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5106                                 const X86Subtarget &Subtarget) {
5107
5108   SDLoc dl(Op);
5109   SDValue Vec = Op.getOperand(0);
5110   SDValue SubVec = Op.getOperand(1);
5111   SDValue Idx = Op.getOperand(2);
5112
5113   if (!isa<ConstantSDNode>(Idx))
5114     return SDValue();
5115
5116   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5117   if (IdxVal == 0  && Vec.isUndef()) // the operation is legal
5118     return Op;
5119
5120   MVT OpVT = Op.getSimpleValueType();
5121   MVT SubVecVT = SubVec.getSimpleValueType();
5122   unsigned NumElems = OpVT.getVectorNumElements();
5123   unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5124
5125   assert(IdxVal + SubVecNumElems <= NumElems &&
5126          IdxVal % SubVecVT.getSizeInBits() == 0 &&
5127          "Unexpected index value in INSERT_SUBVECTOR");
5128
5129   // There are 3 possible cases:
5130   // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5131   // 2. Subvector should be inserted in the upper part
5132   //    (IdxVal + SubVecNumElems == NumElems)
5133   // 3. Subvector should be inserted in the middle (for example v2i1
5134   //    to v16i1, index 2)
5135
5136   // If this node widens - by concatenating zeroes - the type of the result
5137   // of a node with instruction that zeroes all upper (irrelevant) bits of the
5138   // output register, mark this node as legal to enable replacing them with
5139   // the v8i1 version of the previous instruction during instruction selection.
5140   // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
5141   // while zeroing all the upper remaining 60 bits of the register. if the
5142   // result of such instruction is inserted into an allZeroVector, then we can
5143   // safely remove insert_vector (in instruction selection) as the cmp instr
5144   // already zeroed the rest of the register.
5145   if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
5146       (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
5147        (SubVec.getOpcode() == ISD::AND &&
5148         (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
5149          isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
5150     return Op;
5151
5152   // extend to natively supported kshift
5153   MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5154   MVT WideOpVT = OpVT;
5155   if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5156     WideOpVT = MinVT;
5157
5158   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5159   SDValue Undef = DAG.getUNDEF(WideOpVT);
5160   SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5161                                    Undef, SubVec, ZeroIdx);
5162
5163   // Extract sub-vector if require.
5164   auto ExtractSubVec = [&](SDValue V) {
5165     return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5166                                                 OpVT, V, ZeroIdx);
5167   };
5168
5169   if (Vec.isUndef()) {
5170     if (IdxVal != 0) {
5171       SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5172       WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5173                                ShiftBits);
5174     }
5175     return ExtractSubVec(WideSubVec);
5176   }
5177
5178   if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5179     NumElems = WideOpVT.getVectorNumElements();
5180     unsigned ShiftLeft = NumElems - SubVecNumElems;
5181     unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5182     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5183                       DAG.getConstant(ShiftLeft, dl, MVT::i8));
5184     Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5185       DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5186     return ExtractSubVec(Vec);
5187   }
5188
5189   if (IdxVal == 0) {
5190     // Zero lower bits of the Vec
5191     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5192     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5193     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5194     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5195     // Merge them together, SubVec should be zero extended.
5196     WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5197                              getZeroVector(WideOpVT, Subtarget, DAG, dl),
5198                              SubVec, ZeroIdx);
5199     Vec =  DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5200     return ExtractSubVec(Vec);
5201   }
5202
5203   // Simple case when we put subvector in the upper part
5204   if (IdxVal + SubVecNumElems == NumElems) {
5205     // Zero upper bits of the Vec
5206     WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5207                              DAG.getConstant(IdxVal, dl, MVT::i8));
5208     SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5209     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5210     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5211     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5212     Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5213     return ExtractSubVec(Vec);
5214   }
5215   // Subvector should be inserted in the middle - use shuffle
5216   WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5217                            SubVec, ZeroIdx);
5218   SmallVector<int, 64> Mask;
5219   for (unsigned i = 0; i < NumElems; ++i)
5220     Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5221                     i : i + NumElems);
5222   return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5223 }
5224
5225 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5226 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5227 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5228 /// large BUILD_VECTORS.
5229 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5230                                    unsigned NumElems, SelectionDAG &DAG,
5231                                    const SDLoc &dl) {
5232   SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5233   return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5234 }
5235
5236 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5237                                    unsigned NumElems, SelectionDAG &DAG,
5238                                    const SDLoc &dl) {
5239   SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5240   return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5241 }
5242
5243 /// Returns a vector of specified type with all bits set.
5244 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5245 /// Then bitcast to their original type, ensuring they get CSE'd.
5246 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5247   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5248          "Expected a 128/256/512-bit vector type");
5249
5250   APInt Ones = APInt::getAllOnesValue(32);
5251   unsigned NumElts = VT.getSizeInBits() / 32;
5252   SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5253   return DAG.getBitcast(VT, Vec);
5254 }
5255
5256 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5257                               SelectionDAG &DAG) {
5258   EVT InVT = In.getValueType();
5259   assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5260
5261   if (VT.is128BitVector() && InVT.is128BitVector())
5262     return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5263                                 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5264
5265   // For 256-bit vectors, we only need the lower (128-bit) input half.
5266   // For 512-bit vectors, we only need the lower input half or quarter.
5267   if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5268     int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5269     In = extractSubVector(In, 0, DAG, DL,
5270                           std::max(128, (int)VT.getSizeInBits() / Scale));
5271   }
5272
5273   return DAG.getNode(Opc, DL, VT, In);
5274 }
5275
5276 /// Generate unpacklo/unpackhi shuffle mask.
5277 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5278                                     bool Unary) {
5279   assert(Mask.empty() && "Expected an empty shuffle mask vector");
5280   int NumElts = VT.getVectorNumElements();
5281   int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5282
5283   for (int i = 0; i < NumElts; ++i) {
5284     unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5285     int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5286     Pos += (Unary ? 0 : NumElts * (i % 2));
5287     Pos += (Lo ? 0 : NumEltsInLane / 2);
5288     Mask.push_back(Pos);
5289   }
5290 }
5291
5292 /// Returns a vector_shuffle node for an unpackl operation.
5293 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5294                           SDValue V1, SDValue V2) {
5295   SmallVector<int, 8> Mask;
5296   createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5297   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5298 }
5299
5300 /// Returns a vector_shuffle node for an unpackh operation.
5301 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5302                           SDValue V1, SDValue V2) {
5303   SmallVector<int, 8> Mask;
5304   createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5305   return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5306 }
5307
5308 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5309 /// This produces a shuffle where the low element of V2 is swizzled into the
5310 /// zero/undef vector, landing at element Idx.
5311 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
5312 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5313                                            bool IsZero,
5314                                            const X86Subtarget &Subtarget,
5315                                            SelectionDAG &DAG) {
5316   MVT VT = V2.getSimpleValueType();
5317   SDValue V1 = IsZero
5318     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5319   int NumElems = VT.getVectorNumElements();
5320   SmallVector<int, 16> MaskVec(NumElems);
5321   for (int i = 0; i != NumElems; ++i)
5322     // If this is the insertion idx, put the low elt of V2 here.
5323     MaskVec[i] = (i == Idx) ? NumElems : i;
5324   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5325 }
5326
5327 static SDValue peekThroughBitcasts(SDValue V) {
5328   while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5329     V = V.getOperand(0);
5330   return V;
5331 }
5332
5333 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5334   while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5335          V.getOperand(0).hasOneUse())
5336     V = V.getOperand(0);
5337   return V;
5338 }
5339
5340 static const Constant *getTargetConstantFromNode(SDValue Op) {
5341   Op = peekThroughBitcasts(Op);
5342
5343   auto *Load = dyn_cast<LoadSDNode>(Op);
5344   if (!Load)
5345     return nullptr;
5346
5347   SDValue Ptr = Load->getBasePtr();
5348   if (Ptr->getOpcode() == X86ISD::Wrapper ||
5349       Ptr->getOpcode() == X86ISD::WrapperRIP)
5350     Ptr = Ptr->getOperand(0);
5351
5352   auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5353   if (!CNode || CNode->isMachineConstantPoolEntry())
5354     return nullptr;
5355
5356   return dyn_cast<Constant>(CNode->getConstVal());
5357 }
5358
5359 // Extract raw constant bits from constant pools.
5360 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5361                                           APInt &UndefElts,
5362                                           SmallVectorImpl<APInt> &EltBits,
5363                                           bool AllowWholeUndefs = true,
5364                                           bool AllowPartialUndefs = true) {
5365   assert(EltBits.empty() && "Expected an empty EltBits vector");
5366
5367   Op = peekThroughBitcasts(Op);
5368
5369   EVT VT = Op.getValueType();
5370   unsigned SizeInBits = VT.getSizeInBits();
5371   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5372   unsigned NumElts = SizeInBits / EltSizeInBits;
5373
5374   // Bitcast a source array of element bits to the target size.
5375   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5376     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5377     unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5378     assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5379            "Constant bit sizes don't match");
5380
5381     // Don't split if we don't allow undef bits.
5382     bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5383     if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5384       return false;
5385
5386     // If we're already the right size, don't bother bitcasting.
5387     if (NumSrcElts == NumElts) {
5388       UndefElts = UndefSrcElts;
5389       EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5390       return true;
5391     }
5392
5393     // Extract all the undef/constant element data and pack into single bitsets.
5394     APInt UndefBits(SizeInBits, 0);
5395     APInt MaskBits(SizeInBits, 0);
5396
5397     for (unsigned i = 0; i != NumSrcElts; ++i) {
5398       unsigned BitOffset = i * SrcEltSizeInBits;
5399       if (UndefSrcElts[i])
5400         UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5401       MaskBits.insertBits(SrcEltBits[i], BitOffset);
5402     }
5403
5404     // Split the undef/constant single bitset data into the target elements.
5405     UndefElts = APInt(NumElts, 0);
5406     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5407
5408     for (unsigned i = 0; i != NumElts; ++i) {
5409       unsigned BitOffset = i * EltSizeInBits;
5410       APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5411
5412       // Only treat an element as UNDEF if all bits are UNDEF.
5413       if (UndefEltBits.isAllOnesValue()) {
5414         if (!AllowWholeUndefs)
5415           return false;
5416         UndefElts.setBit(i);
5417         continue;
5418       }
5419
5420       // If only some bits are UNDEF then treat them as zero (or bail if not
5421       // supported).
5422       if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5423         return false;
5424
5425       APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5426       EltBits[i] = Bits.getZExtValue();
5427     }
5428     return true;
5429   };
5430
5431   // Collect constant bits and insert into mask/undef bit masks.
5432   auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5433                                 unsigned UndefBitIndex) {
5434     if (!Cst)
5435       return false;
5436     if (isa<UndefValue>(Cst)) {
5437       Undefs.setBit(UndefBitIndex);
5438       return true;
5439     }
5440     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5441       Mask = CInt->getValue();
5442       return true;
5443     }
5444     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5445       Mask = CFP->getValueAPF().bitcastToAPInt();
5446       return true;
5447     }
5448     return false;
5449   };
5450
5451   // Extract constant bits from build vector.
5452   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5453     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5454     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5455
5456     APInt UndefSrcElts(NumSrcElts, 0);
5457     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5458     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5459       const SDValue &Src = Op.getOperand(i);
5460       if (Src.isUndef()) {
5461         UndefSrcElts.setBit(i);
5462         continue;
5463       }
5464       auto *Cst = cast<ConstantSDNode>(Src);
5465       SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5466     }
5467     return CastBitData(UndefSrcElts, SrcEltBits);
5468   }
5469
5470   // Extract constant bits from constant pool vector.
5471   if (auto *Cst = getTargetConstantFromNode(Op)) {
5472     Type *CstTy = Cst->getType();
5473     if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5474       return false;
5475
5476     unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5477     unsigned NumSrcElts = CstTy->getVectorNumElements();
5478
5479     APInt UndefSrcElts(NumSrcElts, 0);
5480     SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5481     for (unsigned i = 0; i != NumSrcElts; ++i)
5482       if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5483                                UndefSrcElts, i))
5484         return false;
5485
5486     return CastBitData(UndefSrcElts, SrcEltBits);
5487   }
5488
5489   // Extract constant bits from a broadcasted constant pool scalar.
5490   if (Op.getOpcode() == X86ISD::VBROADCAST &&
5491       EltSizeInBits <= VT.getScalarSizeInBits()) {
5492     if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5493       unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5494       unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5495
5496       APInt UndefSrcElts(NumSrcElts, 0);
5497       SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5498       if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5499         if (UndefSrcElts[0])
5500           UndefSrcElts.setBits(0, NumSrcElts);
5501         SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5502         return CastBitData(UndefSrcElts, SrcEltBits);
5503       }
5504     }
5505   }
5506
5507   // Extract a rematerialized scalar constant insertion.
5508   if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5509       Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5510       isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5511     unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5512     unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5513
5514     APInt UndefSrcElts(NumSrcElts, 0);
5515     SmallVector<APInt, 64> SrcEltBits;
5516     auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5517     SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5518     SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5519     return CastBitData(UndefSrcElts, SrcEltBits);
5520   }
5521
5522   return false;
5523 }
5524
5525 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5526                                         unsigned MaskEltSizeInBits,
5527                                         SmallVectorImpl<uint64_t> &RawMask) {
5528   APInt UndefElts;
5529   SmallVector<APInt, 64> EltBits;
5530
5531   // Extract the raw target constant bits.
5532   // FIXME: We currently don't support UNDEF bits or mask entries.
5533   if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5534                                      EltBits, /* AllowWholeUndefs */ false,
5535                                      /* AllowPartialUndefs */ false))
5536     return false;
5537
5538   // Insert the extracted elements into the mask.
5539   for (APInt Elt : EltBits)
5540     RawMask.push_back(Elt.getZExtValue());
5541
5542   return true;
5543 }
5544
5545 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5546 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5547 /// operands in \p Ops, and returns true.
5548 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5549 /// IsUnary for shuffles which use a single input multiple times, and in those
5550 /// cases it will adjust the mask to only have indices within that single input.
5551 /// It is an error to call this with non-empty Mask/Ops vectors.
5552 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5553                                  SmallVectorImpl<SDValue> &Ops,
5554                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
5555   unsigned NumElems = VT.getVectorNumElements();
5556   SDValue ImmN;
5557
5558   assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5559   assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5560
5561   IsUnary = false;
5562   bool IsFakeUnary = false;
5563   switch(N->getOpcode()) {
5564   case X86ISD::BLENDI:
5565     ImmN = N->getOperand(N->getNumOperands()-1);
5566     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5567     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5568     break;
5569   case X86ISD::SHUFP:
5570     ImmN = N->getOperand(N->getNumOperands()-1);
5571     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5572     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5573     break;
5574   case X86ISD::INSERTPS:
5575     ImmN = N->getOperand(N->getNumOperands()-1);
5576     DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5577     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5578     break;
5579   case X86ISD::EXTRQI:
5580     if (isa<ConstantSDNode>(N->getOperand(1)) &&
5581         isa<ConstantSDNode>(N->getOperand(2))) {
5582       int BitLen = N->getConstantOperandVal(1);
5583       int BitIdx = N->getConstantOperandVal(2);
5584       DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5585       IsUnary = true;
5586     }
5587     break;
5588   case X86ISD::INSERTQI:
5589     if (isa<ConstantSDNode>(N->getOperand(2)) &&
5590         isa<ConstantSDNode>(N->getOperand(3))) {
5591       int BitLen = N->getConstantOperandVal(2);
5592       int BitIdx = N->getConstantOperandVal(3);
5593       DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5594       IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5595     }
5596     break;
5597   case X86ISD::UNPCKH:
5598     DecodeUNPCKHMask(VT, Mask);
5599     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5600     break;
5601   case X86ISD::UNPCKL:
5602     DecodeUNPCKLMask(VT, Mask);
5603     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5604     break;
5605   case X86ISD::MOVHLPS:
5606     DecodeMOVHLPSMask(NumElems, Mask);
5607     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5608     break;
5609   case X86ISD::MOVLHPS:
5610     DecodeMOVLHPSMask(NumElems, Mask);
5611     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5612     break;
5613   case X86ISD::PALIGNR:
5614     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5615     ImmN = N->getOperand(N->getNumOperands()-1);
5616     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5617     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5618     Ops.push_back(N->getOperand(1));
5619     Ops.push_back(N->getOperand(0));
5620     break;
5621   case X86ISD::VSHLDQ:
5622     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5623     ImmN = N->getOperand(N->getNumOperands() - 1);
5624     DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5625     IsUnary = true;
5626     break;
5627   case X86ISD::VSRLDQ:
5628     assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5629     ImmN = N->getOperand(N->getNumOperands() - 1);
5630     DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5631     IsUnary = true;
5632     break;
5633   case X86ISD::PSHUFD:
5634   case X86ISD::VPERMILPI:
5635     ImmN = N->getOperand(N->getNumOperands()-1);
5636     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5637     IsUnary = true;
5638     break;
5639   case X86ISD::PSHUFHW:
5640     ImmN = N->getOperand(N->getNumOperands()-1);
5641     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5642     IsUnary = true;
5643     break;
5644   case X86ISD::PSHUFLW:
5645     ImmN = N->getOperand(N->getNumOperands()-1);
5646     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5647     IsUnary = true;
5648     break;
5649   case X86ISD::VZEXT_MOVL:
5650     DecodeZeroMoveLowMask(VT, Mask);
5651     IsUnary = true;
5652     break;
5653   case X86ISD::VBROADCAST: {
5654     SDValue N0 = N->getOperand(0);
5655     // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5656     // add the pre-extracted value to the Ops vector.
5657     if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5658         N0.getOperand(0).getValueType() == VT &&
5659         N0.getConstantOperandVal(1) == 0)
5660       Ops.push_back(N0.getOperand(0));
5661
5662     // We only decode broadcasts of same-sized vectors, unless the broadcast
5663     // came from an extract from the original width. If we found one, we
5664     // pushed it the Ops vector above.
5665     if (N0.getValueType() == VT || !Ops.empty()) {
5666       DecodeVectorBroadcast(VT, Mask);
5667       IsUnary = true;
5668       break;
5669     }
5670     return false;
5671   }
5672   case X86ISD::VPERMILPV: {
5673     IsUnary = true;
5674     SDValue MaskNode = N->getOperand(1);
5675     unsigned MaskEltSize = VT.getScalarSizeInBits();
5676     SmallVector<uint64_t, 32> RawMask;
5677     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5678       DecodeVPERMILPMask(VT, RawMask, Mask);
5679       break;
5680     }
5681     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5682       DecodeVPERMILPMask(C, MaskEltSize, Mask);
5683       break;
5684     }
5685     return false;
5686   }
5687   case X86ISD::PSHUFB: {
5688     IsUnary = true;
5689     SDValue MaskNode = N->getOperand(1);
5690     SmallVector<uint64_t, 32> RawMask;
5691     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5692       DecodePSHUFBMask(RawMask, Mask);
5693       break;
5694     }
5695     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5696       DecodePSHUFBMask(C, Mask);
5697       break;
5698     }
5699     return false;
5700   }
5701   case X86ISD::VPERMI:
5702     ImmN = N->getOperand(N->getNumOperands()-1);
5703     DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5704     IsUnary = true;
5705     break;
5706   case X86ISD::MOVSS:
5707   case X86ISD::MOVSD:
5708     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5709     break;
5710   case X86ISD::VPERM2X128:
5711     ImmN = N->getOperand(N->getNumOperands()-1);
5712     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5713     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5714     break;
5715   case X86ISD::MOVSLDUP:
5716     DecodeMOVSLDUPMask(VT, Mask);
5717     IsUnary = true;
5718     break;
5719   case X86ISD::MOVSHDUP:
5720     DecodeMOVSHDUPMask(VT, Mask);
5721     IsUnary = true;
5722     break;
5723   case X86ISD::MOVDDUP:
5724     DecodeMOVDDUPMask(VT, Mask);
5725     IsUnary = true;
5726     break;
5727   case X86ISD::MOVLHPD:
5728   case X86ISD::MOVLPD:
5729   case X86ISD::MOVLPS:
5730     // Not yet implemented
5731     return false;
5732   case X86ISD::VPERMIL2: {
5733     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5734     unsigned MaskEltSize = VT.getScalarSizeInBits();
5735     SDValue MaskNode = N->getOperand(2);
5736     SDValue CtrlNode = N->getOperand(3);
5737     if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5738       unsigned CtrlImm = CtrlOp->getZExtValue();
5739       SmallVector<uint64_t, 32> RawMask;
5740       if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5741         DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5742         break;
5743       }
5744       if (auto *C = getTargetConstantFromNode(MaskNode)) {
5745         DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5746         break;
5747       }
5748     }
5749     return false;
5750   }
5751   case X86ISD::VPPERM: {
5752     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5753     SDValue MaskNode = N->getOperand(2);
5754     SmallVector<uint64_t, 32> RawMask;
5755     if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5756       DecodeVPPERMMask(RawMask, Mask);
5757       break;
5758     }
5759     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5760       DecodeVPPERMMask(C, Mask);
5761       break;
5762     }
5763     return false;
5764   }
5765   case X86ISD::VPERMV: {
5766     IsUnary = true;
5767     // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5768     Ops.push_back(N->getOperand(1));
5769     SDValue MaskNode = N->getOperand(0);
5770     SmallVector<uint64_t, 32> RawMask;
5771     unsigned MaskEltSize = VT.getScalarSizeInBits();
5772     if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5773       DecodeVPERMVMask(RawMask, Mask);
5774       break;
5775     }
5776     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5777       DecodeVPERMVMask(C, MaskEltSize, Mask);
5778       break;
5779     }
5780     return false;
5781   }
5782   case X86ISD::VPERMV3: {
5783     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5784     // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5785     Ops.push_back(N->getOperand(0));
5786     Ops.push_back(N->getOperand(2));
5787     SDValue MaskNode = N->getOperand(1);
5788     unsigned MaskEltSize = VT.getScalarSizeInBits();
5789     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5790       DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5791       break;
5792     }
5793     return false;
5794   }
5795   case X86ISD::VPERMIV3: {
5796     IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5797     // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5798     Ops.push_back(N->getOperand(1));
5799     Ops.push_back(N->getOperand(2));
5800     SDValue MaskNode = N->getOperand(0);
5801     unsigned MaskEltSize = VT.getScalarSizeInBits();
5802     if (auto *C = getTargetConstantFromNode(MaskNode)) {
5803       DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5804       break;
5805     }
5806     return false;
5807   }
5808   default: llvm_unreachable("unknown target shuffle node");
5809   }
5810
5811   // Empty mask indicates the decode failed.
5812   if (Mask.empty())
5813     return false;
5814
5815   // Check if we're getting a shuffle mask with zero'd elements.
5816   if (!AllowSentinelZero)
5817     if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5818       return false;
5819
5820   // If we have a fake unary shuffle, the shuffle mask is spread across two
5821   // inputs that are actually the same node. Re-map the mask to always point
5822   // into the first input.
5823   if (IsFakeUnary)
5824     for (int &M : Mask)
5825       if (M >= (int)Mask.size())
5826         M -= Mask.size();
5827
5828   // If we didn't already add operands in the opcode-specific code, default to
5829   // adding 1 or 2 operands starting at 0.
5830   if (Ops.empty()) {
5831     Ops.push_back(N->getOperand(0));
5832     if (!IsUnary || IsFakeUnary)
5833       Ops.push_back(N->getOperand(1));
5834   }
5835
5836   return true;
5837 }
5838
5839 /// Check a target shuffle mask's inputs to see if we can set any values to
5840 /// SM_SentinelZero - this is for elements that are known to be zero
5841 /// (not just zeroable) from their inputs.
5842 /// Returns true if the target shuffle mask was decoded.
5843 static bool setTargetShuffleZeroElements(SDValue N,
5844                                          SmallVectorImpl<int> &Mask,
5845                                          SmallVectorImpl<SDValue> &Ops) {
5846   bool IsUnary;
5847   if (!isTargetShuffle(N.getOpcode()))
5848     return false;
5849
5850   MVT VT = N.getSimpleValueType();
5851   if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5852     return false;
5853
5854   SDValue V1 = Ops[0];
5855   SDValue V2 = IsUnary ? V1 : Ops[1];
5856
5857   V1 = peekThroughBitcasts(V1);
5858   V2 = peekThroughBitcasts(V2);
5859
5860   assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5861          "Illegal split of shuffle value type");
5862   unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5863
5864   // Extract known constant input data.
5865   APInt UndefSrcElts[2];
5866   SmallVector<APInt, 32> SrcEltBits[2];
5867   bool IsSrcConstant[2] = {
5868       getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5869                                     SrcEltBits[0], true, false),
5870       getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5871                                     SrcEltBits[1], true, false)};
5872
5873   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5874     int M = Mask[i];
5875
5876     // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5877     if (M < 0)
5878       continue;
5879
5880     // Determine shuffle input and normalize the mask.
5881     unsigned SrcIdx = M / Size;
5882     SDValue V = M < Size ? V1 : V2;
5883     M %= Size;
5884
5885     // We are referencing an UNDEF input.
5886     if (V.isUndef()) {
5887       Mask[i] = SM_SentinelUndef;
5888       continue;
5889     }
5890
5891     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5892     // TODO: We currently only set UNDEF for integer types - floats use the same
5893     // registers as vectors and many of the scalar folded loads rely on the
5894     // SCALAR_TO_VECTOR pattern.
5895     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5896         (Size % V.getValueType().getVectorNumElements()) == 0) {
5897       int Scale = Size / V.getValueType().getVectorNumElements();
5898       int Idx = M / Scale;
5899       if (Idx != 0 && !VT.isFloatingPoint())
5900         Mask[i] = SM_SentinelUndef;
5901       else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5902         Mask[i] = SM_SentinelZero;
5903       continue;
5904     }
5905
5906     // Attempt to extract from the source's constant bits.
5907     if (IsSrcConstant[SrcIdx]) {
5908       if (UndefSrcElts[SrcIdx][M])
5909         Mask[i] = SM_SentinelUndef;
5910       else if (SrcEltBits[SrcIdx][M] == 0)
5911         Mask[i] = SM_SentinelZero;
5912     }
5913   }
5914
5915   assert(VT.getVectorNumElements() == Mask.size() &&
5916          "Different mask size from vector size!");
5917   return true;
5918 }
5919
5920 // Attempt to decode ops that could be represented as a shuffle mask.
5921 // The decoded shuffle mask may contain a different number of elements to the
5922 // destination value type.
5923 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5924                                SmallVectorImpl<SDValue> &Ops,
5925                                SelectionDAG &DAG) {
5926   Mask.clear();
5927   Ops.clear();
5928
5929   MVT VT = N.getSimpleValueType();
5930   unsigned NumElts = VT.getVectorNumElements();
5931   unsigned NumSizeInBits = VT.getSizeInBits();
5932   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5933   assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5934          "Expected byte aligned value types");
5935
5936   unsigned Opcode = N.getOpcode();
5937   switch (Opcode) {
5938   case ISD::AND:
5939   case X86ISD::ANDNP: {
5940     // Attempt to decode as a per-byte mask.
5941     APInt UndefElts;
5942     SmallVector<APInt, 32> EltBits;
5943     SDValue N0 = N.getOperand(0);
5944     SDValue N1 = N.getOperand(1);
5945     bool IsAndN = (X86ISD::ANDNP == Opcode);
5946     uint64_t ZeroMask = IsAndN ? 255 : 0;
5947     if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5948       return false;
5949     for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5950       if (UndefElts[i]) {
5951         Mask.push_back(SM_SentinelUndef);
5952         continue;
5953       }
5954       uint64_t ByteBits = EltBits[i].getZExtValue();
5955       if (ByteBits != 0 && ByteBits != 255)
5956         return false;
5957       Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5958     }
5959     Ops.push_back(IsAndN ? N1 : N0);
5960     return true;
5961   }
5962   case ISD::SCALAR_TO_VECTOR: {
5963     // Match against a scalar_to_vector of an extract from a vector,
5964     // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5965     SDValue N0 = N.getOperand(0);
5966     SDValue SrcExtract;
5967
5968     if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5969         N0.getOperand(0).getValueType() == VT) {
5970       SrcExtract = N0;
5971     } else if (N0.getOpcode() == ISD::AssertZext &&
5972                N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
5973                cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
5974       SrcExtract = N0.getOperand(0);
5975       assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
5976     } else if (N0.getOpcode() == ISD::AssertZext &&
5977                N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
5978                cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
5979       SrcExtract = N0.getOperand(0);
5980       assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
5981     }
5982
5983     if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5984       return false;
5985
5986     SDValue SrcVec = SrcExtract.getOperand(0);
5987     EVT SrcVT = SrcVec.getValueType();
5988     unsigned NumSrcElts = SrcVT.getVectorNumElements();
5989     unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5990
5991     unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5992     if (NumSrcElts <= SrcIdx)
5993       return false;
5994
5995     Ops.push_back(SrcVec);
5996     Mask.push_back(SrcIdx);
5997     Mask.append(NumZeros, SM_SentinelZero);
5998     Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
5999     return true;
6000   }
6001   case X86ISD::PINSRB:
6002   case X86ISD::PINSRW: {
6003     SDValue InVec = N.getOperand(0);
6004     SDValue InScl = N.getOperand(1);
6005     uint64_t InIdx = N.getConstantOperandVal(2);
6006     assert(InIdx < NumElts && "Illegal insertion index");
6007
6008     // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6009     if (X86::isZeroNode(InScl)) {
6010       Ops.push_back(InVec);
6011       for (unsigned i = 0; i != NumElts; ++i)
6012         Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6013       return true;
6014     }
6015
6016     // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
6017     // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6018     unsigned ExOp =
6019         (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6020     if (InScl.getOpcode() != ISD::AssertZext ||
6021         InScl.getOperand(0).getOpcode() != ExOp)
6022       return false;
6023
6024     SDValue ExVec = InScl.getOperand(0).getOperand(0);
6025     uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
6026     assert(ExIdx < NumElts && "Illegal extraction index");
6027     Ops.push_back(InVec);
6028     Ops.push_back(ExVec);
6029     for (unsigned i = 0; i != NumElts; ++i)
6030       Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6031     return true;
6032   }
6033   case X86ISD::PACKSS: {
6034     // If we know input saturation won't happen we can treat this
6035     // as a truncation shuffle.
6036     if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
6037         DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
6038       return false;
6039
6040     Ops.push_back(N.getOperand(0));
6041     Ops.push_back(N.getOperand(1));
6042     for (unsigned i = 0; i != NumElts; ++i)
6043       Mask.push_back(i * 2);
6044     return true;
6045   }
6046   case X86ISD::VSHLI:
6047   case X86ISD::VSRLI: {
6048     uint64_t ShiftVal = N.getConstantOperandVal(1);
6049     // Out of range bit shifts are guaranteed to be zero.
6050     if (NumBitsPerElt <= ShiftVal) {
6051       Mask.append(NumElts, SM_SentinelZero);
6052       return true;
6053     }
6054
6055     // We can only decode 'whole byte' bit shifts as shuffles.
6056     if ((ShiftVal % 8) != 0)
6057       break;
6058
6059     uint64_t ByteShift = ShiftVal / 8;
6060     unsigned NumBytes = NumSizeInBits / 8;
6061     unsigned NumBytesPerElt = NumBitsPerElt / 8;
6062     Ops.push_back(N.getOperand(0));
6063
6064     // Clear mask to all zeros and insert the shifted byte indices.
6065     Mask.append(NumBytes, SM_SentinelZero);
6066
6067     if (X86ISD::VSHLI == Opcode) {
6068       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6069         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6070           Mask[i + j] = i + j - ByteShift;
6071     } else {
6072       for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6073         for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6074           Mask[i + j - ByteShift] = i + j;
6075     }
6076     return true;
6077   }
6078   case ISD::ZERO_EXTEND_VECTOR_INREG:
6079   case X86ISD::VZEXT: {
6080     // TODO - add support for VPMOVZX with smaller input vector types.
6081     SDValue Src = N.getOperand(0);
6082     MVT SrcVT = Src.getSimpleValueType();
6083     if (NumSizeInBits != SrcVT.getSizeInBits())
6084       break;
6085     DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6086     Ops.push_back(Src);
6087     return true;
6088   }
6089   }
6090
6091   return false;
6092 }
6093
6094 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6095 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6096                                               SmallVectorImpl<int> &Mask) {
6097   int MaskWidth = Mask.size();
6098   SmallVector<SDValue, 16> UsedInputs;
6099   for (int i = 0, e = Inputs.size(); i < e; ++i) {
6100     int lo = UsedInputs.size() * MaskWidth;
6101     int hi = lo + MaskWidth;
6102     if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6103       UsedInputs.push_back(Inputs[i]);
6104       continue;
6105     }
6106     for (int &M : Mask)
6107       if (lo <= M)
6108         M -= MaskWidth;
6109   }
6110   Inputs = UsedInputs;
6111 }
6112
6113 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6114 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6115 /// remaining input indices in case we now have a unary shuffle and adjust the
6116 /// inputs accordingly.
6117 /// Returns true if the target shuffle mask was decoded.
6118 static bool resolveTargetShuffleInputs(SDValue Op,
6119                                        SmallVectorImpl<SDValue> &Inputs,
6120                                        SmallVectorImpl<int> &Mask,
6121                                        SelectionDAG &DAG) {
6122   if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6123     if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6124       return false;
6125
6126   resolveTargetShuffleInputsAndMask(Inputs, Mask);
6127   return true;
6128 }
6129
6130 /// Returns the scalar element that will make up the ith
6131 /// element of the result of the vector shuffle.
6132 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6133                                    unsigned Depth) {
6134   if (Depth == 6)
6135     return SDValue();  // Limit search depth.
6136
6137   SDValue V = SDValue(N, 0);
6138   EVT VT = V.getValueType();
6139   unsigned Opcode = V.getOpcode();
6140
6141   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6142   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6143     int Elt = SV->getMaskElt(Index);
6144
6145     if (Elt < 0)
6146       return DAG.getUNDEF(VT.getVectorElementType());
6147
6148     unsigned NumElems = VT.getVectorNumElements();
6149     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6150                                          : SV->getOperand(1);
6151     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6152   }
6153
6154   // Recurse into target specific vector shuffles to find scalars.
6155   if (isTargetShuffle(Opcode)) {
6156     MVT ShufVT = V.getSimpleValueType();
6157     MVT ShufSVT = ShufVT.getVectorElementType();
6158     int NumElems = (int)ShufVT.getVectorNumElements();
6159     SmallVector<int, 16> ShuffleMask;
6160     SmallVector<SDValue, 16> ShuffleOps;
6161     bool IsUnary;
6162
6163     if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6164       return SDValue();
6165
6166     int Elt = ShuffleMask[Index];
6167     if (Elt == SM_SentinelZero)
6168       return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6169                                  : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6170     if (Elt == SM_SentinelUndef)
6171       return DAG.getUNDEF(ShufSVT);
6172
6173     assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6174     SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6175     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6176                                Depth+1);
6177   }
6178
6179   // Actual nodes that may contain scalar elements
6180   if (Opcode == ISD::BITCAST) {
6181     V = V.getOperand(0);
6182     EVT SrcVT = V.getValueType();
6183     unsigned NumElems = VT.getVectorNumElements();
6184
6185     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6186       return SDValue();
6187   }
6188
6189   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6190     return (Index == 0) ? V.getOperand(0)
6191                         : DAG.getUNDEF(VT.getVectorElementType());
6192
6193   if (V.getOpcode() == ISD::BUILD_VECTOR)
6194     return V.getOperand(Index);
6195
6196   return SDValue();
6197 }
6198
6199 /// Custom lower build_vector of v16i8.
6200 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6201                                      unsigned NumNonZero, unsigned NumZero,
6202                                      SelectionDAG &DAG,
6203                                      const X86Subtarget &Subtarget) {
6204   if (NumNonZero > 8 && !Subtarget.hasSSE41())
6205     return SDValue();
6206
6207   SDLoc dl(Op);
6208   SDValue V;
6209   bool First = true;
6210
6211   // SSE4.1 - use PINSRB to insert each byte directly.
6212   if (Subtarget.hasSSE41()) {
6213     for (unsigned i = 0; i < 16; ++i) {
6214       bool IsNonZero = (NonZeros & (1 << i)) != 0;
6215       if (IsNonZero) {
6216         // If the build vector contains zeros or our first insertion is not the
6217         // first index then insert into zero vector to break any register
6218         // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6219         if (First) {
6220           First = false;
6221           if (NumZero || 0 != i)
6222             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6223           else {
6224             assert(0 == i && "Expected insertion into zero-index");
6225             V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6226             V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6227             V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6228             V = DAG.getBitcast(MVT::v16i8, V);
6229             continue;
6230           }
6231         }
6232         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6233                         Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6234       }
6235     }
6236
6237     return V;
6238   }
6239
6240   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6241   for (unsigned i = 0; i < 16; ++i) {
6242     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6243     if (ThisIsNonZero && First) {
6244       if (NumZero)
6245         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6246       else
6247         V = DAG.getUNDEF(MVT::v8i16);
6248       First = false;
6249     }
6250
6251     if ((i & 1) != 0) {
6252       // FIXME: Investigate extending to i32 instead of just i16.
6253       // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6254       SDValue ThisElt, LastElt;
6255       bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6256       if (LastIsNonZero) {
6257         LastElt =
6258             DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6259       }
6260       if (ThisIsNonZero) {
6261         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6262         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6263                               DAG.getConstant(8, dl, MVT::i8));
6264         if (LastIsNonZero)
6265           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6266       } else
6267         ThisElt = LastElt;
6268
6269       if (ThisElt) {
6270         if (1 == i) {
6271           V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6272                       : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6273           V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6274           V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6275           V = DAG.getBitcast(MVT::v8i16, V);
6276         } else {
6277           V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6278                           DAG.getIntPtrConstant(i / 2, dl));
6279         }
6280       }
6281     }
6282   }
6283
6284   return DAG.getBitcast(MVT::v16i8, V);
6285 }
6286
6287 /// Custom lower build_vector of v8i16.
6288 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6289                                      unsigned NumNonZero, unsigned NumZero,
6290                                      SelectionDAG &DAG,
6291                                      const X86Subtarget &Subtarget) {
6292   if (NumNonZero > 4 && !Subtarget.hasSSE41())
6293     return SDValue();
6294
6295   SDLoc dl(Op);
6296   SDValue V;
6297   bool First = true;
6298   for (unsigned i = 0; i < 8; ++i) {
6299     bool IsNonZero = (NonZeros & (1 << i)) != 0;
6300     if (IsNonZero) {
6301       // If the build vector contains zeros or our first insertion is not the
6302       // first index then insert into zero vector to break any register
6303       // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6304       if (First) {
6305         First = false;
6306         if (NumZero || 0 != i)
6307           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6308         else {
6309           assert(0 == i && "Expected insertion into zero-index");
6310           V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6311           V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6312           V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6313           V = DAG.getBitcast(MVT::v8i16, V);
6314           continue;
6315         }
6316       }
6317       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6318                       Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6319     }
6320   }
6321
6322   return V;
6323 }
6324
6325 /// Custom lower build_vector of v4i32 or v4f32.
     ///
     /// Every operand of \p Op must be either zeroable (undef or zero) or an
     /// EXTRACT_VECTOR_ELT with a constant index from a 128-bit vector; otherwise
     /// an empty SDValue is returned. Two lowerings are attempted, in order:
     ///  1. A vector shuffle of the common source vector with a zero vector, used
     ///     when every non-zeroable operand i is element i of that same source.
     ///  2. A single X86ISD::INSERTPS node (requires SSE4.1), used when one
     ///     non-zeroable operand breaks the pattern above and all the remaining
     ///     ones are in-place elements of one source vector.
     /// The returned node has the type of the source vector (VT below).
6326 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6327                                      const X86Subtarget &Subtarget) {
6328   // Find all zeroable elements.
6329   std::bitset<4> Zeroable;
6330   for (int i=0; i < 4; ++i) {
6331     SDValue Elt = Op->getOperand(i);
6332     Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6333   }
6334   assert(Zeroable.size() - Zeroable.count() > 1 &&
6335          "We expect at least two non-zero elements!");
6336
6337   // We only know how to deal with build_vector nodes where elements are either
6338   // zeroable or extract_vector_elt with constant index.
6339   SDValue FirstNonZero;
       // Only written together with FirstNonZero in the loop below; the assert
       // after the loop guarantees both have been set before they are read.
6340   unsigned FirstNonZeroIdx;
6341   for (unsigned i=0; i < 4; ++i) {
6342     if (Zeroable[i])
6343       continue;
6344     SDValue Elt = Op->getOperand(i);
6345     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6346         !isa<ConstantSDNode>(Elt.getOperand(1)))
6347       return SDValue();
6348     // Make sure that this node is extracting from a 128-bit vector.
6349     MVT VT = Elt.getOperand(0).getSimpleValueType();
6350     if (!VT.is128BitVector())
6351       return SDValue();
6352     if (!FirstNonZero.getNode()) {
6353       FirstNonZero = Elt;
6354       FirstNonZeroIdx = i;
6355     }
6356   }
6357
6358   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6359   SDValue V1 = FirstNonZero.getOperand(0);
6360   MVT VT = V1.getSimpleValueType();
6361
6362   // See if this build_vector can be lowered as a blend with zero.
       // The loop stops early (EltIdx < 4) at the first non-zeroable operand that
       // is not element EltIdx of V1. Elt and EltMaskIdx deliberately outlive the
       // loop: after a break they describe that mismatching operand and are used
       // by the INSERTPS path further down.
6363   SDValue Elt;
6364   unsigned EltMaskIdx, EltIdx;
6365   int Mask[4];
6366   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6367     if (Zeroable[EltIdx]) {
6368       // The zero vector will be on the right hand side.
6369       Mask[EltIdx] = EltIdx+4;
6370       continue;
6371     }
6372
6373     Elt = Op->getOperand(EltIdx);
6374     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6375     EltMaskIdx = Elt.getConstantOperandVal(1);
6376     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6377       break;
6378     Mask[EltIdx] = EltIdx;
6379   }
6380
6381   if (EltIdx == 4) {
6382     // Let the shuffle legalizer deal with blend operations.
6383     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
         // NOTE(review): VT was taken from V1 above and V1 has not been reassigned
         // since, so this condition looks like it can never be true here.
6384     if (V1.getSimpleValueType() != VT)
6385       V1 = DAG.getBitcast(VT, V1);
6386     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6387   }
6388
6389   // See if we can lower this build_vector to a INSERTPS.
6390   if (!Subtarget.hasSSE41())
6391     return SDValue();
6392
       // V2 is the vector the mismatching operand is extracted from. If that
       // operand is also the first non-zero one, V1 was only a guess taken from it,
       // so clear V1 and let the loop below pick the base vector from the next
       // non-zeroable operand.
6393   SDValue V2 = Elt.getOperand(0);
6394   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6395     V1 = SDValue();
6396
       // Every remaining non-zeroable operand i must be element i of V1.
6397   bool CanFold = true;
6398   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6399     if (Zeroable[i])
6400       continue;
6401
6402     SDValue Current = Op->getOperand(i);
6403     SDValue SrcVector = Current->getOperand(0);
6404     if (!V1.getNode())
6405       V1 = SrcVector;
6406     CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6407   }
6408
6409   if (!CanFold)
6410     return SDValue();
6411
6412   assert(V1.getNode() && "Expected at least two non-zero elements!");
6413   if (V1.getSimpleValueType() != MVT::v4f32)
6414     V1 = DAG.getBitcast(MVT::v4f32, V1);
6415   if (V2.getSimpleValueType() != MVT::v4f32)
6416     V2 = DAG.getBitcast(MVT::v4f32, V2);
6417
6418   // Ok, we can emit an INSERTPS instruction.
6419   unsigned ZMask = Zeroable.to_ulong();
6420
       // Immediate layout as built here: bits 7:6 hold EltMaskIdx (the index
       // extracted from V2), bits 5:4 hold EltIdx (the position being filled in
       // the result) and bits 3:0 hold the zeroable-element mask.
6421   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6422   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6423   SDLoc DL(Op);
6424   SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6425                                DAG.getIntPtrConstant(InsertPSMask, DL));
6426   return DAG.getBitcast(VT, Result);
6427 }
6428
6429 /// Return a vector logical shift node.
6430 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6431                          SelectionDAG &DAG, const TargetLowering &TLI,
6432                          const SDLoc &dl) {
6433   assert(VT.is128BitVector() && "Unknown type for VShift");
6434   MVT ShVT = MVT::v16i8;
6435   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6436   SrcOp = DAG.getBitcast(ShVT, SrcOp);
6437   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6438   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6439   SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6440   return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6441 }
6442
6443 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6444                                       SelectionDAG &DAG) {
6445
6446   // Check if the scalar load can be widened into a vector load. And if
6447   // the address is "base + cst" see if the cst can be "absorbed" into
6448   // the shuffle mask.
6449   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6450     SDValue Ptr = LD->getBasePtr();
6451     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6452       return SDValue();
6453     EVT PVT = LD->getValueType(0);
6454     if (PVT != MVT::i32 && PVT != MVT::f32)
6455       return SDValue();
6456
6457     int FI = -1;
6458     int64_t Offset = 0;
6459     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6460       FI = FINode->getIndex();
6461       Offset = 0;
6462     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6463                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6464       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6465       Offset = Ptr.getConstantOperandVal(1);
6466       Ptr = Ptr.getOperand(0);
6467     } else {
6468       return SDValue();
6469     }
6470
6471     // FIXME: 256-bit vector instructions don't require a strict alignment,
6472     // improve this code to support it better.
6473     unsigned RequiredAlign = VT.getSizeInBits()/8;
6474     SDValue Chain = LD->getChain();
6475     // Make sure the stack object alignment is at least 16 or 32.
6476     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6477     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6478       if (MFI.isFixedObjectIndex(FI)) {
6479         // Can't change the alignment. FIXME: It's possible to compute
6480         // the exact stack offset and reference FI + adjust offset instead.
6481         // If someone *really* cares about this. That's the way to implement it.
6482         return SDValue();
6483       } else {
6484         MFI.setObjectAlignment(FI, RequiredAlign);
6485       }
6486     }
6487
6488     // (Offset % 16 or 32) must be multiple of 4. Then address is then
6489     // Ptr + (Offset & ~15).
6490     if (Offset < 0)
6491       return SDValue();
6492     if ((Offset % RequiredAlign) & 3)
6493       return SDValue();
6494     int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6495     if (StartOffset) {
6496       SDLoc DL(Ptr);
6497       Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6498                         DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6499     }
6500
6501     int EltNo = (Offset - StartOffset) >> 2;
6502     unsigned NumElems = VT.getVectorNumElements();
6503
6504     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6505     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6506                              LD->getPointerInfo().getWithOffset(StartOffset));
6507
6508     SmallVector<int, 8> Mask(NumElems, EltNo);
6509
6510     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6511   }
6512
6513   return SDValue();
6514 }
6515
6516 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6517 /// elements can be replaced by a single large load which has the same value as
6518 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6519 ///
6520 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
     ///
     /// Each element (looked at through bitcasts) must be undef, zero, or a
     /// non-extending load; any other element makes the function return an empty
     /// SDValue. When \p isAfterLegalize is true, the wide load is only formed if
     /// ISD::LOAD is legal for \p VT, and the load-plus-zeroing-shuffle form is not
     /// used. An empty SDValue is returned when no replacement is found.
6521 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6522                                         const SDLoc &DL, SelectionDAG &DAG,
6523                                         const X86Subtarget &Subtarget,
6524                                         bool isAfterLegalize) {
6525   unsigned NumElems = Elts.size();
6526
       // Classification of every element: exactly one of the three masks below is
       // set per index (checked by the assert after the loop).
6527   int LastLoadedElt = -1;
6528   SmallBitVector LoadMask(NumElems, false);
6529   SmallBitVector ZeroMask(NumElems, false);
6530   SmallBitVector UndefMask(NumElems, false);
6531
6532   // For each element in the initializer, see if we've found a load, zero or an
6533   // undef.
6534   for (unsigned i = 0; i < NumElems; ++i) {
6535     SDValue Elt = peekThroughBitcasts(Elts[i]);
6536     if (!Elt.getNode())
6537       return SDValue();
6538
6539     if (Elt.isUndef())
6540       UndefMask[i] = true;
6541     else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6542       ZeroMask[i] = true;
6543     else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6544       LoadMask[i] = true;
6545       LastLoadedElt = i;
6546       // Each loaded element must be the correct fractional portion of the
6547       // requested vector load.
6548       if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6549         return SDValue();
6550     } else
6551       return SDValue();
6552   }
6553   assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6554          "Incomplete element masks");
6555
6556   // Handle Special Cases - all undef or undef/zero.
6557   if (UndefMask.count() == NumElems)
6558     return DAG.getUNDEF(VT);
6559
6560   // FIXME: Should we return this as a BUILD_VECTOR instead?
6561   if ((ZeroMask | UndefMask).count() == NumElems)
6562     return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6563                           : DAG.getConstantFP(0.0, DL, VT);
6564
       // Past this point at least one element is a load (the all-undef and
       // all-zero/undef cases returned above), so find_first() yields a valid
       // index.
6565   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6566   int FirstLoadedElt = LoadMask.find_first();
6567   SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6568   LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6569   EVT LDBaseVT = EltBase.getValueType();
6570
6571   // Consecutive loads can contain UNDEFS but not ZERO elements.
6572   // Consecutive loads with UNDEFs and ZEROs elements require a
6573   // an additional shuffle stage to clear the ZERO elements.
6574   bool IsConsecutiveLoad = true;
6575   bool IsConsecutiveLoadWithZeros = true;
6576   for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6577     if (LoadMask[i]) {
6578       SDValue Elt = peekThroughBitcasts(Elts[i]);
6579       LoadSDNode *LD = cast<LoadSDNode>(Elt);
6580       if (!DAG.areNonVolatileConsecutiveLoads(
6581               LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6582               i - FirstLoadedElt)) {
6583         IsConsecutiveLoad = false;
6584         IsConsecutiveLoadWithZeros = false;
6585         break;
6586       }
6587     } else if (ZeroMask[i]) {
6588       IsConsecutiveLoad = false;
6589     }
6590   }
6591
       // Helper: create a load of type VT that reuses LDBase's chain, base pointer,
       // pointer info, alignment and memory-operand flags, then tie its memory
       // ordering to LDBase via makeEquivalentMemoryOrdering.
6592   auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6593     auto MMOFlags = LDBase->getMemOperand()->getFlags();
6594     assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6595            "Cannot merge volatile loads.");
6596     SDValue NewLd =
6597         DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6598                     LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6599     DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
6600     return NewLd;
6601   };
6602
6603   // LOAD - all consecutive load/undefs (must start/end with a load).
6604   // If we have found an entire vector of loads and undefs, then return a large
6605   // load of the entire vector width starting at the base pointer.
6606   // If the vector contains zeros, then attempt to shuffle those elements.
6607   if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6608       (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6609     assert(LDBase && "Did not find base load for merging consecutive loads");
6610     EVT EltVT = LDBase->getValueType(0);
6611     // Ensure that the input vector size for the merged loads matches the
6612     // cumulative size of the input elements.
6613     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6614       return SDValue();
6615
6616     if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6617       return SDValue();
6618
6619     // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6620     // will lower to regular temporal loads and use the cache.
6621     if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6622         VT.is256BitVector() && !Subtarget.hasInt256())
6623       return SDValue();
6624
6625     if (IsConsecutiveLoad)
6626       return CreateLoad(VT, LDBase);
6627
6628     // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6629     // vector and a zero vector to clear out the zero elements.
6630     if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
           // Mask entries: i selects the loaded vector, i + NumElems selects the
           // zero vector, and -1 (undef elements) is left as "don't care".
6631       SmallVector<int, 4> ClearMask(NumElems, -1);
6632       for (unsigned i = 0; i < NumElems; ++i) {
6633         if (ZeroMask[i])
6634           ClearMask[i] = i + NumElems;
6635         else if (LoadMask[i])
6636           ClearMask[i] = i;
6637       }
6638       SDValue V = CreateLoad(VT, LDBase);
6639       SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6640                                  : DAG.getConstantFP(0.0, DL, VT);
6641       return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6642     }
6643   }
6644
       // Number of bits spanned from the first to the last loaded element, counting
       // each slot at the base load's store size.
6645   int LoadSize =
6646       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6647
6648   // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6649   if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6650       (LoadSize == 32 || LoadSize == 64) &&
6651       ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
         // Use a vector whose elements are LoadSize bits wide (float or integer to
         // match VT) and whose total size equals VT; the result is bitcast back to
         // VT below.
6652     MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6653                                       : MVT::getIntegerVT(LoadSize);
6654     MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6655     if (TLI.isTypeLegal(VecVT)) {
6656       SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6657       SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6658       SDValue ResNode =
6659           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6660                                   LDBase->getPointerInfo(),
6661                                   LDBase->getAlignment(),
6662                                   false/*isVolatile*/, true/*ReadMem*/,
6663                                   false/*WriteMem*/);
6664       DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
6665       return DAG.getBitcast(VT, ResNode);
6666     }
6667   }
6668
6669   return SDValue();
6670 }
6671
6672 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6673                                    unsigned SplatBitSize, LLVMContext &C) {
6674   unsigned ScalarSize = VT.getScalarSizeInBits();
6675   unsigned NumElm = SplatBitSize / ScalarSize;
6676
6677   SmallVector<Constant *, 32> ConstantVec;
6678   for (unsigned i = 0; i < NumElm; i++) {
6679     APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6680     Constant *Const;
6681     if (VT.isFloatingPoint()) {
6682       if (ScalarSize == 32) {
6683         Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6684       } else {
6685         assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6686         Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6687       }
6688     } else
6689       Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6690     ConstantVec.push_back(Const);
6691   }
6692   return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6693 }
6694
6695 static bool isUseOfShuffle(SDNode *N) {
6696   for (auto *U : N->uses()) {
6697     if (isTargetShuffle(U->getOpcode()))
6698       return true;
6699     if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6700       return isUseOfShuffle(U);
6701   }
6702   return false;
6703 }
6704
6705 /// Attempt to use the vbroadcast instruction to generate a splat value
6706 /// from a splat BUILD_VECTOR which uses:
6707 ///  a. A single scalar load, or a constant.
6708 ///  b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6709 ///
6710 /// The VBROADCAST node is returned when a pattern is found,
6711 /// or SDValue() otherwise.
     ///
     /// For case (b) the repeated pattern is placed in the constant pool and
     /// loaded; patterns wider than 64 bits are broadcast with SUBV_BROADCAST
     /// instead of VBROADCAST. Nothing is done on subtargets without AVX.
6712 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6713                                            const X86Subtarget &Subtarget,
6714                                            SelectionDAG &DAG) {
6715   // VBROADCAST requires AVX.
6716   // TODO: Splats could be generated for non-AVX CPUs using SSE
6717   // instructions, but there's less potential gain for only 128-bit vectors.
6718   if (!Subtarget.hasAVX())
6719     return SDValue();
6720
6721   MVT VT = BVOp->getSimpleValueType(0);
6722   SDLoc dl(BVOp);
6723
6724   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6725          "Unsupported vector type for broadcast.");
6726
6727   BitVector UndefElements;
6728   SDValue Ld = BVOp->getSplatValue(&UndefElements);
6729
6730   // We need a splat of a single value to use broadcast, and it doesn't
6731   // make any sense if the value is only in one element of the vector.
6732   if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6733     APInt SplatValue, Undef;
6734     unsigned SplatBitSize;
6735     bool HasUndef;
6736     // Check if this is a repeated constant pattern suitable for broadcasting.
         // The pattern must be wider than one element and narrower than the whole
         // vector.
6737     if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6738         SplatBitSize > VT.getScalarSizeInBits() &&
6739         SplatBitSize < VT.getSizeInBits()) {
6740       // Avoid replacing with broadcast when it's a use of a shuffle
6741       // instruction to preserve the present custom lowering of shuffles.
           // NOTE(review): this also bails out when the build vector has a single
           // use; the rationale for that is not stated here.
6742       if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6743         return SDValue();
6744       // replace BUILD_VECTOR with broadcast of the repeated constants.
6745       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6746       LLVMContext *Ctx = DAG.getContext();
6747       MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
           // NOTE(review): hasAVX() was already required by the early return at the
           // top of this function, so this check is always true here.
6748       if (Subtarget.hasAVX()) {
6749         if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6750             !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6751           // Splatted value can fit in one INTEGER constant in constant pool.
6752           // Load the constant and broadcast it.
6753           MVT CVT = MVT::getIntegerVT(SplatBitSize);
6754           Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6755           Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6756           SDValue CP = DAG.getConstantPool(C, PVT);
6757           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6758
6759           unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6760           Ld = DAG.getLoad(
6761               CVT, dl, DAG.getEntryNode(), CP,
6762               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6763               Alignment);
6764           SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6765                                        MVT::getVectorVT(CVT, Repeat), Ld);
6766           return DAG.getBitcast(VT, Brdcst);
6767         } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6768           // Splatted value can fit in one FLOAT constant in constant pool.
6769           // Load the constant and broadcast it.
6770           // AVX have support for 32 and 64 bit broadcast for floats only.
6771           // No 64bit integer in 32bit subtarget.
6772           MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6773           // Lower the splat via APFloat directly, to avoid any conversion.
6774           Constant *C =
6775               SplatBitSize == 32
6776                   ? ConstantFP::get(*Ctx,
6777                                     APFloat(APFloat::IEEEsingle(), SplatValue))
6778                   : ConstantFP::get(*Ctx,
6779                                     APFloat(APFloat::IEEEdouble(), SplatValue));
6780           SDValue CP = DAG.getConstantPool(C, PVT);
6781           unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6782
6783           unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6784           Ld = DAG.getLoad(
6785               CVT, dl, DAG.getEntryNode(), CP,
6786               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6787               Alignment);
6788           SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6789                                        MVT::getVectorVT(CVT, Repeat), Ld);
6790           return DAG.getBitcast(VT, Brdcst);
6791         } else if (SplatBitSize > 64) {
6792           // Load the vector of constants and broadcast it.
6793           MVT CVT = VT.getScalarType();
6794           Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6795                                              *Ctx);
6796           SDValue VCP = DAG.getConstantPool(VecC, PVT);
6797           unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6798           unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6799           Ld = DAG.getLoad(
6800               MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6801               MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6802               Alignment);
               // The broadcast node is created with type VT already, so the
               // bitcast to VT on the return line does not change the type.
6803           SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6804           return DAG.getBitcast(VT, Brdcst);
6805         }
6806       }
6807     }
6808     return SDValue();
6809   }
6810
       // From here on Ld is the single value splatted into more than one element
       // of the build vector.
6811   bool ConstSplatVal =
6812       (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6813
6814   // Make sure that all of the users of a non-constant load are from the
6815   // BUILD_VECTOR node.
6816   if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6817     return SDValue();
6818
6819   unsigned ScalarSize = Ld.getValueSizeInBits();
6820   bool IsGE256 = (VT.getSizeInBits() >= 256);
6821
6822   // When optimizing for size, generate up to 5 extra bytes for a broadcast
6823   // instruction to save 8 or more bytes of constant pool data.
6824   // TODO: If multiple splats are generated to load the same constant,
6825   // it may be detrimental to overall size. There needs to be a way to detect
6826   // that condition to know if this is truly a size win.
6827   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6828
6829   // Handle broadcasting a single constant scalar from the constant pool
6830   // into a vector.
6831   // On Sandybridge (no AVX2), it is still better to load a constant vector
6832   // from the constant pool and not to broadcast it from a scalar.
6833   // But override that restriction when optimizing for size.
6834   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6835   if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6836     EVT CVT = Ld.getValueType();
6837     assert(!CVT.isVector() && "Must not broadcast a vector type");
6838
6839     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6840     // For size optimization, also splat v2f64 and v2i64, and for size opt
6841     // with AVX2, also splat i8 and i16.
6842     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6843     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6844         (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6845       const Constant *C = nullptr;
6846       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6847         C = CI->getConstantIntValue();
6848       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6849         C = CF->getConstantFPValue();
6850
6851       assert(C && "Invalid constant type");
6852
           // Replace the immediate with a load from the constant pool and
           // broadcast that load.
6853       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6854       SDValue CP =
6855           DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6856       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6857       Ld = DAG.getLoad(
6858           CVT, dl, DAG.getEntryNode(), CP,
6859           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6860           Alignment);
6861
6862       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6863     }
6864   }
6865
6866   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6867
6868   // Handle AVX2 in-register broadcasts.
6869   if (!IsLoad && Subtarget.hasInt256() &&
6870       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6871     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6872
6873   // The scalar source must be a normal load.
6874   if (!IsLoad)
6875     return SDValue();
6876
       // Broadcast from memory: 32-bit scalars, 64-bit scalars into vectors of at
       // least 256 bits, or any 64-bit scalar when VLX is available.
6877   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6878       (Subtarget.hasVLX() && ScalarSize == 64))
6879     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6880
6881   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6882   // double since there is no vbroadcastsd xmm
6883   if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6884     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6885       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6886   }
6887
6888   // Unsupported broadcast.
6889   return SDValue();
6890 }
6891
6892 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6893 /// underlying vector and index.
6894 ///
6895 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6896 /// index.
6897 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6898                                          SDValue ExtIdx) {
6899   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6900   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6901     return Idx;
6902
6903   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6904   // lowered this:
6905   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6906   // to:
6907   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6908   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
6909   //                           undef)
6910   //                       Constant<0>)
6911   // In this case the vector is the extract_subvector expression and the index
6912   // is 2, as specified by the shuffle.
6913   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6914   SDValue ShuffleVec = SVOp->getOperand(0);
6915   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6916   assert(ShuffleVecVT.getVectorElementType() ==
6917          ExtractedFromVec.getSimpleValueType().getVectorElementType());
6918
6919   int ShuffleIdx = SVOp->getMaskElt(Idx);
6920   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6921     ExtractedFromVec = ShuffleVec;
6922     return ShuffleIdx;
6923   }
6924   return Idx;
6925 }
6926
6927 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6928   MVT VT = Op.getSimpleValueType();
6929
6930   // Skip if insert_vec_elt is not supported.
6931   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6932   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6933     return SDValue();
6934
6935   SDLoc DL(Op);
6936   unsigned NumElems = Op.getNumOperands();
6937
6938   SDValue VecIn1;
6939   SDValue VecIn2;
6940   SmallVector<unsigned, 4> InsertIndices;
6941   SmallVector<int, 8> Mask(NumElems, -1);
6942
6943   for (unsigned i = 0; i != NumElems; ++i) {
6944     unsigned Opc = Op.getOperand(i).getOpcode();
6945
6946     if (Opc == ISD::UNDEF)
6947       continue;
6948
6949     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6950       // Quit if more than 1 elements need inserting.
6951       if (InsertIndices.size() > 1)
6952         return SDValue();
6953
6954       InsertIndices.push_back(i);
6955       continue;
6956     }
6957
6958     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6959     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6960
6961     // Quit if non-constant index.
6962     if (!isa<ConstantSDNode>(ExtIdx))
6963       return SDValue();
6964     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6965
6966     // Quit if extracted from vector of different type.
6967     if (ExtractedFromVec.getValueType() != VT)
6968       return SDValue();
6969
6970     if (!VecIn1.getNode())
6971       VecIn1 = ExtractedFromVec;
6972     else if (VecIn1 != ExtractedFromVec) {
6973       if (!VecIn2.getNode())
6974         VecIn2 = ExtractedFromVec;
6975       else if (VecIn2 != ExtractedFromVec)
6976         // Quit if more than 2 vectors to shuffle
6977         return SDValue();
6978     }
6979
6980     if (ExtractedFromVec == VecIn1)
6981       Mask[i] = Idx;
6982     else if (ExtractedFromVec == VecIn2)
6983       Mask[i] = Idx + NumElems;
6984   }
6985
6986   if (!VecIn1.getNode())
6987     return SDValue();
6988
6989   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6990   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6991
6992   for (unsigned Idx : InsertIndices)
6993     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6994                      DAG.getIntPtrConstant(Idx, DL));
6995
6996   return NV;
6997 }
6998
6999 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7000   assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7001          Op.getScalarValueSizeInBits() == 1 &&
7002          "Can not convert non-constant vector");
7003   uint64_t Immediate = 0;
7004   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7005     SDValue In = Op.getOperand(idx);
7006     if (!In.isUndef())
7007       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7008   }
7009   SDLoc dl(Op);
7010   MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7011   return DAG.getConstant(Immediate, dl, VT);
7012 }
7013 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7014 SDValue
7015 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7016
7017   MVT VT = Op.getSimpleValueType();
7018   assert((VT.getVectorElementType() == MVT::i1) &&
7019          "Unexpected type in LowerBUILD_VECTORvXi1!");
7020
7021   SDLoc dl(Op);
7022   if (ISD::isBuildVectorAllZeros(Op.getNode()))
7023     return DAG.getTargetConstant(0, dl, VT);
7024
7025   if (ISD::isBuildVectorAllOnes(Op.getNode()))
7026     return DAG.getTargetConstant(1, dl, VT);
7027
7028   if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7029     if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7030       // Split the pieces.
7031       SDValue Lower =
7032           DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7033       SDValue Upper =
7034           DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7035       // We have to manually lower both halves so getNode doesn't try to
7036       // reassemble the build_vector.
7037       Lower = LowerBUILD_VECTORvXi1(Lower, DAG);
7038       Upper = LowerBUILD_VECTORvXi1(Upper, DAG);
7039       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7040     }
7041     SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7042     if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7043       return DAG.getBitcast(VT, Imm);
7044     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7045     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7046                         DAG.getIntPtrConstant(0, dl));
7047   }
7048
7049   // Vector has one or more non-const elements
7050   uint64_t Immediate = 0;
7051   SmallVector<unsigned, 16> NonConstIdx;
7052   bool IsSplat = true;
7053   bool HasConstElts = false;
7054   int SplatIdx = -1;
7055   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7056     SDValue In = Op.getOperand(idx);
7057     if (In.isUndef())
7058       continue;
7059     if (!isa<ConstantSDNode>(In))
7060       NonConstIdx.push_back(idx);
7061     else {
7062       Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7063       HasConstElts = true;
7064     }
7065     if (SplatIdx < 0)
7066       SplatIdx = idx;
7067     else if (In != Op.getOperand(SplatIdx))
7068       IsSplat = false;
7069   }
7070
7071   // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
7072   if (IsSplat)
7073     return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7074                          DAG.getConstant(1, dl, VT),
7075                          DAG.getConstant(0, dl, VT));
7076
7077   // insert elements one by one
7078   SDValue DstVec;
7079   SDValue Imm;
7080   if (Immediate) {
7081     MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7082     Imm = DAG.getConstant(Immediate, dl, ImmVT);
7083   }
7084   else if (HasConstElts)
7085     Imm = DAG.getConstant(0, dl, VT);
7086   else
7087     Imm = DAG.getUNDEF(VT);
7088   if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7089     DstVec = DAG.getBitcast(VT, Imm);
7090   else {
7091     SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7092     DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7093                          DAG.getIntPtrConstant(0, dl));
7094   }
7095
7096   for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7097     unsigned InsertIdx = NonConstIdx[i];
7098     DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7099                          Op.getOperand(InsertIdx),
7100                          DAG.getIntPtrConstant(InsertIdx, dl));
7101   }
7102   return DstVec;
7103 }
7104
7105 /// \brief Return true if \p N implements a horizontal binop and return the
7106 /// operands for the horizontal binop into V0 and V1.
7107 ///
7108 /// This is a helper function of LowerToHorizontalOp().
7109 /// This function checks that the build_vector \p N in input implements a
7110 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7111 /// operation to match.
7112 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7113 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7114 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7115 /// arithmetic sub.
7116 ///
7117 /// This function only analyzes elements of \p N whose indices are
7118 /// in range [BaseIdx, LastIdx).
7119 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7120                               SelectionDAG &DAG,
7121                               unsigned BaseIdx, unsigned LastIdx,
7122                               SDValue &V0, SDValue &V1) {
7123   EVT VT = N->getValueType(0);
7124
7125   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7126   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7127          "Invalid Vector in input!");
7128
7129   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7130   bool CanFold = true;
7131   unsigned ExpectedVExtractIdx = BaseIdx;
7132   unsigned NumElts = LastIdx - BaseIdx;
7133   V0 = DAG.getUNDEF(VT);
7134   V1 = DAG.getUNDEF(VT);
7135
7136   // Check if N implements a horizontal binop.
7137   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7138     SDValue Op = N->getOperand(i + BaseIdx);
7139
7140     // Skip UNDEFs.
7141     if (Op->isUndef()) {
7142       // Update the expected vector extract index.
7143       if (i * 2 == NumElts)
7144         ExpectedVExtractIdx = BaseIdx;
7145       ExpectedVExtractIdx += 2;
7146       continue;
7147     }
7148
7149     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7150
7151     if (!CanFold)
7152       break;
7153
7154     SDValue Op0 = Op.getOperand(0);
7155     SDValue Op1 = Op.getOperand(1);
7156
7157     // Try to match the following pattern:
7158     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7159     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7160         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7161         Op0.getOperand(0) == Op1.getOperand(0) &&
7162         isa<ConstantSDNode>(Op0.getOperand(1)) &&
7163         isa<ConstantSDNode>(Op1.getOperand(1)));
7164     if (!CanFold)
7165       break;
7166
7167     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7168     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7169
7170     if (i * 2 < NumElts) {
7171       if (V0.isUndef()) {
7172         V0 = Op0.getOperand(0);
7173         if (V0.getValueType() != VT)
7174           return false;
7175       }
7176     } else {
7177       if (V1.isUndef()) {
7178         V1 = Op0.getOperand(0);
7179         if (V1.getValueType() != VT)
7180           return false;
7181       }
7182       if (i * 2 == NumElts)
7183         ExpectedVExtractIdx = BaseIdx;
7184     }
7185
7186     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7187     if (I0 == ExpectedVExtractIdx)
7188       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7189     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7190       // Try to match the following dag sequence:
7191       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7192       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7193     } else
7194       CanFold = false;
7195
7196     ExpectedVExtractIdx += 2;
7197   }
7198
7199   return CanFold;
7200 }
7201
7202 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7203 /// a concat_vector.
7204 ///
7205 /// This is a helper function of LowerToHorizontalOp().
7206 /// This function expects two 256-bit vectors called V0 and V1.
7207 /// At first, each vector is split into two separate 128-bit vectors.
7208 /// Then, the resulting 128-bit vectors are used to implement two
7209 /// horizontal binary operations.
7210 ///
7211 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7212 ///
7213 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7214 /// the two new horizontal binop.
7215 /// When Mode is set, the first horizontal binop dag node would take as input
7216 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7217 /// horizontal binop dag node would take as input the lower 128-bit of V1
7218 /// and the upper 128-bit of V1.
7219 ///   Example:
7220 ///     HADD V0_LO, V0_HI
7221 ///     HADD V1_LO, V1_HI
7222 ///
7223 /// Otherwise, the first horizontal binop dag node takes as input the lower
7224 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7225 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7226 ///   Example:
7227 ///     HADD V0_LO, V1_LO
7228 ///     HADD V0_HI, V1_HI
7229 ///
7230 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7231 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7232 /// the upper 128-bits of the result.
7233 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7234                                      const SDLoc &DL, SelectionDAG &DAG,
7235                                      unsigned X86Opcode, bool Mode,
7236                                      bool isUndefLO, bool isUndefHI) {
7237   MVT VT = V0.getSimpleValueType();
7238   assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7239          "Invalid nodes in input!");
7240
7241   unsigned NumElts = VT.getVectorNumElements();
7242   SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7243   SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7244   SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7245   SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7246   MVT NewVT = V0_LO.getSimpleValueType();
7247
7248   SDValue LO = DAG.getUNDEF(NewVT);
7249   SDValue HI = DAG.getUNDEF(NewVT);
7250
7251   if (Mode) {
7252     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7253     if (!isUndefLO && !V0->isUndef())
7254       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7255     if (!isUndefHI && !V1->isUndef())
7256       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7257   } else {
7258     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7259     if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7260       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7261
7262     if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7263       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7264   }
7265
7266   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7267 }
7268
7269 /// Returns true iff \p BV builds a vector with the result equivalent to
7270 /// the result of ADDSUB operation.
7271 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7272 /// are written to the parameters \p Opnd0 and \p Opnd1.
7273 static bool isAddSub(const BuildVectorSDNode *BV,
7274                      const X86Subtarget &Subtarget, SelectionDAG &DAG,
7275                      SDValue &Opnd0, SDValue &Opnd1) {
7276
7277   MVT VT = BV->getSimpleValueType(0);
7278   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7279       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7280       (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7281     return false;
7282
7283   unsigned NumElts = VT.getVectorNumElements();
7284   SDValue InVec0 = DAG.getUNDEF(VT);
7285   SDValue InVec1 = DAG.getUNDEF(VT);
7286
7287   // Odd-numbered elements in the input build vector are obtained from
7288   // adding two integer/float elements.
7289   // Even-numbered elements in the input build vector are obtained from
7290   // subtracting two integer/float elements.
7291   unsigned ExpectedOpcode = ISD::FSUB;
7292   unsigned NextExpectedOpcode = ISD::FADD;
7293   bool AddFound = false;
7294   bool SubFound = false;
7295
7296   for (unsigned i = 0, e = NumElts; i != e; ++i) {
7297     SDValue Op = BV->getOperand(i);
7298
7299     // Skip 'undef' values.
7300     unsigned Opcode = Op.getOpcode();
7301     if (Opcode == ISD::UNDEF) {
7302       std::swap(ExpectedOpcode, NextExpectedOpcode);
7303       continue;
7304     }
7305
7306     // Early exit if we found an unexpected opcode.
7307     if (Opcode != ExpectedOpcode)
7308       return false;
7309
7310     SDValue Op0 = Op.getOperand(0);
7311     SDValue Op1 = Op.getOperand(1);
7312
7313     // Try to match the following pattern:
7314     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7315     // Early exit if we cannot match that sequence.
7316     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7317         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7318         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7319         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7320         Op0.getOperand(1) != Op1.getOperand(1))
7321       return false;
7322
7323     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7324     if (I0 != i)
7325       return false;
7326
7327     // We found a valid add/sub node. Update the information accordingly.
7328     if (i & 1)
7329       AddFound = true;
7330     else
7331       SubFound = true;
7332
7333     // Update InVec0 and InVec1.
7334     if (InVec0.isUndef()) {
7335       InVec0 = Op0.getOperand(0);
7336       if (InVec0.getSimpleValueType() != VT)
7337         return false;
7338     }
7339     if (InVec1.isUndef()) {
7340       InVec1 = Op1.getOperand(0);
7341       if (InVec1.getSimpleValueType() != VT)
7342         return false;
7343     }
7344
7345     // Make sure that operands in input to each add/sub node always
7346     // come from a same pair of vectors.
7347     if (InVec0 != Op0.getOperand(0)) {
7348       if (ExpectedOpcode == ISD::FSUB)
7349         return false;
7350
7351       // FADD is commutable. Try to commute the operands
7352       // and then test again.
7353       std::swap(Op0, Op1);
7354       if (InVec0 != Op0.getOperand(0))
7355         return false;
7356     }
7357
7358     if (InVec1 != Op1.getOperand(0))
7359       return false;
7360
7361     // Update the pair of expected opcodes.
7362     std::swap(ExpectedOpcode, NextExpectedOpcode);
7363   }
7364
7365   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7366   if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7367     return false;
7368
7369   Opnd0 = InVec0;
7370   Opnd1 = InVec1;
7371   return true;
7372 }
7373
7374 /// Returns true if is possible to fold MUL and an idiom that has already been
7375 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7376 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7377 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7378 ///
7379 /// Prior to calling this function it should be known that there is some
7380 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7381 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7382 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7383 /// of \p Opnd0 uses is expected to be equal to 2.
7384 /// For example, this function may be called for the following IR:
7385 ///    %AB = fmul fast <2 x double> %A, %B
7386 ///    %Sub = fsub fast <2 x double> %AB, %C
7387 ///    %Add = fadd fast <2 x double> %AB, %C
7388 ///    %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7389 ///                            <2 x i32> <i32 0, i32 3>
7390 /// There is a def for %Addsub here, which potentially can be replaced by
7391 /// X86ISD::ADDSUB operation:
7392 ///    %Addsub = X86ISD::ADDSUB %AB, %C
7393 /// and such ADDSUB can further be replaced with FMADDSUB:
7394 ///    %Addsub = FMADDSUB %A, %B, %C.
7395 ///
7396 /// The main reason why this method is called before the replacement of the
7397 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7398 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7399 /// FMADDSUB is.
7400 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7401                        SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7402   if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7403       !Subtarget.hasAnyFMA())
7404     return false;
7405
7406   // FIXME: These checks must match the similar ones in
7407   // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7408   // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7409   // or MUL + ADDSUB to FMADDSUB.
7410   const TargetOptions &Options = DAG.getTarget().Options;
7411   bool AllowFusion =
7412       (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7413   if (!AllowFusion)
7414     return false;
7415
7416   Opnd2 = Opnd1;
7417   Opnd1 = Opnd0.getOperand(1);
7418   Opnd0 = Opnd0.getOperand(0);
7419
7420   return true;
7421 }
7422
7423 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7424 /// accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7425 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7426                                        const X86Subtarget &Subtarget,
7427                                        SelectionDAG &DAG) {
7428   SDValue Opnd0, Opnd1;
7429   if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7430     return SDValue();
7431
7432   MVT VT = BV->getSimpleValueType(0);
7433   SDLoc DL(BV);
7434
7435   // Try to generate X86ISD::FMADDSUB node here.
7436   SDValue Opnd2;
7437   if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7438     return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7439
7440   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7441   // the ADDSUB idiom has been successfully recognized. There are no known
7442   // X86 targets with 512-bit ADDSUB instructions!
7443   // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7444   // recognition.
7445   if (VT.is512BitVector())
7446     return SDValue();
7447
7448   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7449 }
7450
7451 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7452 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7453                                    const X86Subtarget &Subtarget,
7454                                    SelectionDAG &DAG) {
7455   MVT VT = BV->getSimpleValueType(0);
7456   unsigned NumElts = VT.getVectorNumElements();
7457   unsigned NumUndefsLO = 0;
7458   unsigned NumUndefsHI = 0;
7459   unsigned Half = NumElts/2;
7460
7461   // Count the number of UNDEF operands in the build_vector in input.
7462   for (unsigned i = 0, e = Half; i != e; ++i)
7463     if (BV->getOperand(i)->isUndef())
7464       NumUndefsLO++;
7465
7466   for (unsigned i = Half, e = NumElts; i != e; ++i)
7467     if (BV->getOperand(i)->isUndef())
7468       NumUndefsHI++;
7469
7470   // Early exit if this is either a build_vector of all UNDEFs or all the
7471   // operands but one are UNDEF.
7472   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7473     return SDValue();
7474
7475   SDLoc DL(BV);
7476   SDValue InVec0, InVec1;
7477   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7478     // Try to match an SSE3 float HADD/HSUB.
7479     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7480       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7481
7482     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7483       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7484   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7485     // Try to match an SSSE3 integer HADD/HSUB.
7486     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7487       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7488
7489     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7490       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7491   }
7492
7493   if (!Subtarget.hasAVX())
7494     return SDValue();
7495
7496   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7497     // Try to match an AVX horizontal add/sub of packed single/double
7498     // precision floating point values from 256-bit vectors.
7499     SDValue InVec2, InVec3;
7500     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7501         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7502         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7503         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7504       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7505
7506     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7507         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7508         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7509         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7510       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7511   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7512     // Try to match an AVX2 horizontal add/sub of signed integers.
7513     SDValue InVec2, InVec3;
7514     unsigned X86Opcode;
7515     bool CanFold = true;
7516
7517     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7518         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7519         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7520         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7521       X86Opcode = X86ISD::HADD;
7522     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7523         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7524         ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7525         ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7526       X86Opcode = X86ISD::HSUB;
7527     else
7528       CanFold = false;
7529
7530     if (CanFold) {
7531       // Fold this build_vector into a single horizontal add/sub.
7532       // Do this only if the target has AVX2.
7533       if (Subtarget.hasAVX2())
7534         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7535
7536       // Do not try to expand this build_vector into a pair of horizontal
7537       // add/sub if we can emit a pair of scalar add/sub.
7538       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7539         return SDValue();
7540
7541       // Convert this build_vector into a pair of horizontal binop followed by
7542       // a concat vector.
7543       bool isUndefLO = NumUndefsLO == Half;
7544       bool isUndefHI = NumUndefsHI == Half;
7545       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7546                                    isUndefLO, isUndefHI);
7547     }
7548   }
7549
7550   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7551        VT == MVT::v16i16) && Subtarget.hasAVX()) {
7552     unsigned X86Opcode;
7553     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7554       X86Opcode = X86ISD::HADD;
7555     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7556       X86Opcode = X86ISD::HSUB;
7557     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7558       X86Opcode = X86ISD::FHADD;
7559     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7560       X86Opcode = X86ISD::FHSUB;
7561     else
7562       return SDValue();
7563
7564     // Don't try to expand this build_vector into a pair of horizontal add/sub
7565     // if we can simply emit a pair of scalar add/sub.
7566     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7567       return SDValue();
7568
7569     // Convert this build_vector into two horizontal add/sub followed by
7570     // a concat vector.
7571     bool isUndefLO = NumUndefsLO == Half;
7572     bool isUndefHI = NumUndefsHI == Half;
7573     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7574                                  isUndefLO, isUndefHI);
7575   }
7576
7577   return SDValue();
7578 }
7579
7580 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7581 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7582 /// just apply the bit to the vectors.
7583 /// NOTE: Its not in our interest to start make a general purpose vectorizer
7584 /// from this, but enough scalar bit operations are created from the later
7585 /// legalization + scalarization stages to need basic support.
7586 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7587                                        SelectionDAG &DAG) {
7588   SDLoc DL(Op);
7589   MVT VT = Op->getSimpleValueType(0);
7590   unsigned NumElems = VT.getVectorNumElements();
7591   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7592
7593   // Check that all elements have the same opcode.
7594   // TODO: Should we allow UNDEFS and if so how many?
7595   unsigned Opcode = Op->getOperand(0).getOpcode();
7596   for (unsigned i = 1; i < NumElems; ++i)
7597     if (Opcode != Op->getOperand(i).getOpcode())
7598       return SDValue();
7599
7600   // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7601   switch (Opcode) {
7602   default:
7603     return SDValue();
7604   case ISD::AND:
7605   case ISD::XOR:
7606   case ISD::OR:
7607     if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7608       return SDValue();
7609     break;
7610   }
7611
7612   SmallVector<SDValue, 4> LHSElts, RHSElts;
7613   for (SDValue Elt : Op->ops()) {
7614     SDValue LHS = Elt.getOperand(0);
7615     SDValue RHS = Elt.getOperand(1);
7616
7617     // We expect the canonicalized RHS operand to be the constant.
7618     if (!isa<ConstantSDNode>(RHS))
7619       return SDValue();
7620     LHSElts.push_back(LHS);
7621     RHSElts.push_back(RHS);
7622   }
7623
7624   SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7625   SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7626   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7627 }
7628
7629 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7630 /// functionality to do this, so it's all zeros, all ones, or some derivation
7631 /// that is cheap to calculate.
7632 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7633                                          const X86Subtarget &Subtarget) {
7634   SDLoc DL(Op);
7635   MVT VT = Op.getSimpleValueType();
7636
7637   // Vectors containing all zeros can be matched by pxor and xorps.
7638   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7639     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7640     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7641     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7642       return Op;
7643
7644     return getZeroVector(VT, Subtarget, DAG, DL);
7645   }
7646
7647   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7648   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7649   // vpcmpeqd on 256-bit vectors.
7650   if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7651     if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7652         (VT == MVT::v8i32 && Subtarget.hasInt256()))
7653       return Op;
7654
7655     return getOnesVector(VT, DAG, DL);
7656   }
7657
7658   return SDValue();
7659 }
7660
7661 SDValue
7662 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7663   SDLoc dl(Op);
7664
7665   MVT VT = Op.getSimpleValueType();
7666   MVT ExtVT = VT.getVectorElementType();
7667   unsigned NumElems = Op.getNumOperands();
7668
7669   // Generate vectors for predicate vectors.
7670   if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7671     return LowerBUILD_VECTORvXi1(Op, DAG);
7672
7673   if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7674     return VectorConstant;
7675
7676   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7677   if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7678     return AddSub;
7679   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7680     return HorizontalOp;
7681   if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7682     return Broadcast;
7683   if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7684     return BitOp;
7685
7686   unsigned EVTBits = ExtVT.getSizeInBits();
7687
7688   unsigned NumZero  = 0;
7689   unsigned NumNonZero = 0;
7690   uint64_t NonZeros = 0;
7691   bool IsAllConstants = true;
7692   SmallSet<SDValue, 8> Values;
7693   for (unsigned i = 0; i < NumElems; ++i) {
7694     SDValue Elt = Op.getOperand(i);
7695     if (Elt.isUndef())
7696       continue;
7697     Values.insert(Elt);
7698     if (Elt.getOpcode() != ISD::Constant &&
7699         Elt.getOpcode() != ISD::ConstantFP)
7700       IsAllConstants = false;
7701     if (X86::isZeroNode(Elt))
7702       NumZero++;
7703     else {
7704       assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7705       NonZeros |= ((uint64_t)1 << i);
7706       NumNonZero++;
7707     }
7708   }
7709
7710   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
7711   if (NumNonZero == 0)
7712     return DAG.getUNDEF(VT);
7713
7714   // Special case for single non-zero, non-undef, element.
7715   if (NumNonZero == 1) {
7716     unsigned Idx = countTrailingZeros(NonZeros);
7717     SDValue Item = Op.getOperand(Idx);
7718
7719     // If this is an insertion of an i64 value on x86-32, and if the top bits of
7720     // the value are obviously zero, truncate the value to i32 and do the
7721     // insertion that way.  Only do this if the value is non-constant or if the
7722     // value is a constant being inserted into element 0.  It is cheaper to do
7723     // a constant pool load than it is to do a movd + shuffle.
7724     if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7725         (!IsAllConstants || Idx == 0)) {
7726       if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7727         // Handle SSE only.
7728         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7729         MVT VecVT = MVT::v4i32;
7730
7731         // Truncate the value (which may itself be a constant) to i32, and
7732         // convert it to a vector with movd (S2V+shuffle to zero extend).
7733         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7734         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7735         return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7736                                       Item, Idx * 2, true, Subtarget, DAG));
7737       }
7738     }
7739
7740     // If we have a constant or non-constant insertion into the low element of
7741     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7742     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
7743     // depending on what the source datatype is.
7744     if (Idx == 0) {
7745       if (NumZero == 0)
7746         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7747
7748       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7749           (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7750         assert((VT.is128BitVector() || VT.is256BitVector() ||
7751                 VT.is512BitVector()) &&
7752                "Expected an SSE value type!");
7753         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7754         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7755         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7756       }
7757
7758       // We can't directly insert an i8 or i16 into a vector, so zero extend
7759       // it to i32 first.
7760       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7761         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7762         if (VT.getSizeInBits() >= 256) {
7763           MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7764           if (Subtarget.hasAVX()) {
7765             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7766             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7767           } else {
7768             // Without AVX, we need to extend to a 128-bit vector and then
7769             // insert into the 256-bit vector.
7770             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7771             SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7772             Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7773           }
7774         } else {
7775           assert(VT.is128BitVector() && "Expected an SSE value type!");
7776           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7777           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7778         }
7779         return DAG.getBitcast(VT, Item);
7780       }
7781     }
7782
7783     // Is it a vector logical left shift?
7784     if (NumElems == 2 && Idx == 1 &&
7785         X86::isZeroNode(Op.getOperand(0)) &&
7786         !X86::isZeroNode(Op.getOperand(1))) {
7787       unsigned NumBits = VT.getSizeInBits();
7788       return getVShift(true, VT,
7789                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7790                                    VT, Op.getOperand(1)),
7791                        NumBits/2, DAG, *this, dl);
7792     }
7793
7794     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7795       return SDValue();
7796
7797     // Otherwise, if this is a vector with i32 or f32 elements, and the element
7798     // is a non-constant being inserted into an element other than the low one,
7799     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
7800     // movd/movss) to move this into the low element, then shuffle it into
7801     // place.
7802     if (EVTBits == 32) {
7803       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7804       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7805     }
7806   }
7807
7808   // Splat is obviously ok. Let legalizer expand it to a shuffle.
7809   if (Values.size() == 1) {
7810     if (EVTBits == 32) {
7811       // Instead of a shuffle like this:
7812       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7813       // Check if it's possible to issue this instead.
7814       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7815       unsigned Idx = countTrailingZeros(NonZeros);
7816       SDValue Item = Op.getOperand(Idx);
7817       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7818         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7819     }
7820     return SDValue();
7821   }
7822
7823   // A vector full of immediates; various special cases are already
7824   // handled, so this is best done with a single constant-pool load.
7825   if (IsAllConstants)
7826     return SDValue();
7827
7828   // See if we can use a vector load to get all of the elements.
7829   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7830     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7831     if (SDValue LD =
7832             EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
7833       return LD;
7834   }
7835
7836   // For AVX-length vectors, build the individual 128-bit pieces and use
7837   // shuffles to put them in place.
7838   if (VT.is256BitVector() || VT.is512BitVector()) {
7839     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7840
7841     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7842
7843     // Build both the lower and upper subvector.
7844     SDValue Lower =
7845         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7846     SDValue Upper = DAG.getBuildVector(
7847         HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7848
7849     // Recreate the wider vector with the lower and upper part.
7850     if (VT.is256BitVector())
7851       return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7852     return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7853   }
7854
7855   // Let legalizer expand 2-wide build_vectors.
7856   if (EVTBits == 64) {
7857     if (NumNonZero == 1) {
7858       // One half is zero or undef.
7859       unsigned Idx = countTrailingZeros(NonZeros);
7860       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7861                                Op.getOperand(Idx));
7862       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7863     }
7864     return SDValue();
7865   }
7866
7867   // If element VT is < 32 bits, convert it to inserts into a zero vector.
7868   if (EVTBits == 8 && NumElems == 16)
7869     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7870                                           DAG, Subtarget))
7871       return V;
7872
7873   if (EVTBits == 16 && NumElems == 8)
7874     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7875                                           DAG, Subtarget))
7876       return V;
7877
7878   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7879   if (EVTBits == 32 && NumElems == 4)
7880     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
7881       return V;
7882
7883   // If element VT is == 32 bits, turn it into a number of shuffles.
7884   if (NumElems == 4 && NumZero > 0) {
7885     SmallVector<SDValue, 8> Ops(NumElems);
7886     for (unsigned i = 0; i < 4; ++i) {
7887       bool isZero = !(NonZeros & (1ULL << i));
7888       if (isZero)
7889         Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7890       else
7891         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7892     }
7893
7894     for (unsigned i = 0; i < 2; ++i) {
7895       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7896         default: break;
7897         case 0:
7898           Ops[i] = Ops[i*2];  // Must be a zero vector.
7899           break;
7900         case 1:
7901           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7902           break;
7903         case 2:
7904           Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7905           break;
7906         case 3:
7907           Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7908           break;
7909       }
7910     }
7911
7912     bool Reverse1 = (NonZeros & 0x3) == 2;
7913     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7914     int MaskVec[] = {
7915       Reverse1 ? 1 : 0,
7916       Reverse1 ? 0 : 1,
7917       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7918       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
7919     };
7920     return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7921   }
7922
7923   if (Values.size() > 1 && VT.is128BitVector()) {
7924     // Check for a build vector from mostly shuffle plus few inserting.
7925     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7926       return Sh;
7927
7928     // For SSE 4.1, use insertps to put the high elements into the low element.
7929     if (Subtarget.hasSSE41()) {
7930       SDValue Result;
7931       if (!Op.getOperand(0).isUndef())
7932         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7933       else
7934         Result = DAG.getUNDEF(VT);
7935
7936       for (unsigned i = 1; i < NumElems; ++i) {
7937         if (Op.getOperand(i).isUndef()) continue;
7938         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7939                              Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7940       }
7941       return Result;
7942     }
7943
7944     // Otherwise, expand into a number of unpckl*, start by extending each of
7945     // our (non-undef) elements to the full vector width with the element in the
7946     // bottom slot of the vector (which generates no code for SSE).
7947     SmallVector<SDValue, 8> Ops(NumElems);
7948     for (unsigned i = 0; i < NumElems; ++i) {
7949       if (!Op.getOperand(i).isUndef())
7950         Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7951       else
7952         Ops[i] = DAG.getUNDEF(VT);
7953     }
7954
7955     // Next, we iteratively mix elements, e.g. for v4f32:
7956     //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
7957     //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
7958     //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
7959     for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
7960       // Generate scaled UNPCKL shuffle mask.
7961       SmallVector<int, 16> Mask;
7962       for(unsigned i = 0; i != Scale; ++i)
7963         Mask.push_back(i);
7964       for (unsigned i = 0; i != Scale; ++i)
7965         Mask.push_back(NumElems+i);
7966       Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
7967
7968       for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
7969         Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
7970     }
7971     return Ops[0];
7972   }
7973   return SDValue();
7974 }
7975
7976 // 256-bit AVX can use the vinsertf128 instruction
7977 // to create 256-bit vectors from two other 128-bit ones.
7978 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7979   SDLoc dl(Op);
7980   MVT ResVT = Op.getSimpleValueType();
7981
7982   assert((ResVT.is256BitVector() ||
7983           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7984
7985   SDValue V1 = Op.getOperand(0);
7986   SDValue V2 = Op.getOperand(1);
7987   unsigned NumElems = ResVT.getVectorNumElements();
7988   if (ResVT.is256BitVector())
7989     return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7990
7991   if (Op.getNumOperands() == 4) {
7992     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7993                                   ResVT.getVectorNumElements()/2);
7994     SDValue V3 = Op.getOperand(2);
7995     SDValue V4 = Op.getOperand(3);
7996     return concat256BitVectors(
7997         concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7998         concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7999         NumElems, DAG, dl);
8000   }
8001   return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8002 }
8003
8004 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8005 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8006 static bool isExpandWithZeros(const SDValue &Op) {
8007   assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8008          "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8009
8010   for (unsigned i = 1; i < Op.getNumOperands(); i++)
8011     if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8012       return false;
8013
8014   return true;
8015 }
8016
8017 // Returns true if the given node is a type promotion (by concatenating i1
8018 // zeros) of the result of a node that already zeros all upper bits of
8019 // k-register.
8020 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8021   unsigned Opc = Op.getOpcode();
8022
8023   assert(Opc == ISD::CONCAT_VECTORS &&
8024          Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8025          "Unexpected node to check for type promotion!");
8026
8027   // As long as we are concatenating zeros to the upper part of a previous node
8028   // result, climb up the tree until a node with different opcode is
8029   // encountered
8030   while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8031     if (Opc == ISD::INSERT_SUBVECTOR) {
8032       if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8033           Op.getConstantOperandVal(2) == 0)
8034         Op = Op.getOperand(1);
8035       else
8036         return SDValue();
8037     } else { // Opc == ISD::CONCAT_VECTORS
8038       if (isExpandWithZeros(Op))
8039         Op = Op.getOperand(0);
8040       else
8041         return SDValue();
8042     }
8043     Opc = Op.getOpcode();
8044   }
8045
8046   // Check if the first inserted node zeroes the upper bits, or an 'and' result
8047   // of a node that zeros the upper bits (its masked version).
8048   if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8049       (Op.getOpcode() == ISD::AND &&
8050        (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8051         isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8052     return Op;
8053   }
8054
8055   return SDValue();
8056 }
8057
8058 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8059                                        const X86Subtarget &Subtarget,
8060                                        SelectionDAG & DAG) {
8061   SDLoc dl(Op);
8062   MVT ResVT = Op.getSimpleValueType();
8063   unsigned NumOfOperands = Op.getNumOperands();
8064
8065   assert(isPowerOf2_32(NumOfOperands) &&
8066          "Unexpected number of operands in CONCAT_VECTORS");
8067
8068   // If this node promotes - by concatenating zeroes - the type of the result
8069   // of a node with instruction that zeroes all upper (irrelevant) bits of the
8070   // output register, mark it as legal and catch the pattern in instruction
8071   // selection to avoid emitting extra insturctions (for zeroing upper bits).
8072   if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8073     SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
8074     SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
8075     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8076                        ZeroC);
8077   }
8078
8079   SDValue Undef = DAG.getUNDEF(ResVT);
8080   if (NumOfOperands > 2) {
8081     // Specialize the cases when all, or all but one, of the operands are undef.
8082     unsigned NumOfDefinedOps = 0;
8083     unsigned OpIdx = 0;
8084     for (unsigned i = 0; i < NumOfOperands; i++)
8085       if (!Op.getOperand(i).isUndef()) {
8086         NumOfDefinedOps++;
8087         OpIdx = i;
8088       }
8089     if (NumOfDefinedOps == 0)
8090       return Undef;
8091     if (NumOfDefinedOps == 1) {
8092       unsigned SubVecNumElts =
8093         Op.getOperand(OpIdx).getValueType().getVectorNumElements();
8094       SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
8095       return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
8096                          Op.getOperand(OpIdx), IdxVal);
8097     }
8098
8099     MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8100                                   ResVT.getVectorNumElements()/2);
8101     SmallVector<SDValue, 2> Ops;
8102     for (unsigned i = 0; i < NumOfOperands/2; i++)
8103       Ops.push_back(Op.getOperand(i));
8104     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8105     Ops.clear();
8106     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
8107       Ops.push_back(Op.getOperand(i));
8108     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8109     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8110   }
8111
8112   // 2 operands
8113   SDValue V1 = Op.getOperand(0);
8114   SDValue V2 = Op.getOperand(1);
8115   unsigned NumElems = ResVT.getVectorNumElements();
8116   assert(V1.getValueType() == V2.getValueType() &&
8117          V1.getValueType().getVectorNumElements() == NumElems/2 &&
8118          "Unexpected operands in CONCAT_VECTORS");
8119
8120   if (ResVT.getSizeInBits() >= 16)
8121     return Op; // The operation is legal with KUNPCK
8122
8123   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
8124   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
8125   SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
8126   if (IsZeroV1 && IsZeroV2)
8127     return ZeroVec;
8128
8129   SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
8130   if (V2.isUndef())
8131     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8132   if (IsZeroV2)
8133     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8134
8135   SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
8136   if (V1.isUndef())
8137     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8138
8139   if (IsZeroV1)
8140     return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8141
8142   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8143   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8144 }
8145
8146 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8147                                    const X86Subtarget &Subtarget,
8148                                    SelectionDAG &DAG) {
8149   MVT VT = Op.getSimpleValueType();
8150   if (VT.getVectorElementType() == MVT::i1)
8151     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8152
8153   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8154          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8155           Op.getNumOperands() == 4)));
8156
8157   // AVX can use the vinsertf128 instruction to create 256-bit vectors
8158   // from two other 128-bit ones.
8159
8160   // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8161   return LowerAVXCONCAT_VECTORS(Op, DAG);
8162 }
8163
8164 //===----------------------------------------------------------------------===//
8165 // Vector shuffle lowering
8166 //
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns, while staying
// within a framework that allows reasonably efficient handling of all vector
// shuffle patterns.
8173 //===----------------------------------------------------------------------===//
8174
8175 /// \brief Tiny helper function to identify a no-op mask.
8176 ///
8177 /// This is a somewhat boring predicate function. It checks whether the mask
8178 /// array input, which is assumed to be a single-input shuffle mask of the kind
8179 /// used by the X86 shuffle instructions (not a fully general
8180 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8181 /// in-place shuffle are 'no-op's.
8182 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8183   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8184     assert(Mask[i] >= -1 && "Out of bound mask element!");
8185     if (Mask[i] >= 0 && Mask[i] != i)
8186       return false;
8187   }
8188   return true;
8189 }
8190
8191 /// \brief Test whether there are elements crossing 128-bit lanes in this
8192 /// shuffle mask.
8193 ///
8194 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8195 /// and we routinely test for these.
8196 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8197   int LaneSize = 128 / VT.getScalarSizeInBits();
8198   int Size = Mask.size();
8199   for (int i = 0; i < Size; ++i)
8200     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8201       return true;
8202   return false;
8203 }
8204
8205 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8206 ///
8207 /// This checks a shuffle mask to see if it is performing the same
8208 /// lane-relative shuffle in each sub-lane. This trivially implies
8209 /// that it is also not lane-crossing. It may however involve a blend from the
8210 /// same lane of a second vector.
8211 ///
8212 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8213 /// non-trivial to compute in the face of undef lanes. The representation is
8214 /// suitable for use with existing 128-bit shuffles as entries from the second
8215 /// vector have been remapped to [LaneSize, 2*LaneSize).
8216 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8217                                   ArrayRef<int> Mask,
8218                                   SmallVectorImpl<int> &RepeatedMask) {
8219   auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8220   RepeatedMask.assign(LaneSize, -1);
8221   int Size = Mask.size();
8222   for (int i = 0; i < Size; ++i) {
8223     assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8224     if (Mask[i] < 0)
8225       continue;
8226     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8227       // This entry crosses lanes, so there is no way to model this shuffle.
8228       return false;
8229
8230     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8231     // Adjust second vector indices to start at LaneSize instead of Size.
8232     int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8233                                 : Mask[i] % LaneSize + LaneSize;
8234     if (RepeatedMask[i % LaneSize] < 0)
8235       // This is the first non-undef entry in this slot of a 128-bit lane.
8236       RepeatedMask[i % LaneSize] = LocalM;
8237     else if (RepeatedMask[i % LaneSize] != LocalM)
8238       // Found a mismatch with the repeated mask.
8239       return false;
8240   }
8241   return true;
8242 }
8243
8244 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8245 static bool
8246 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8247                                 SmallVectorImpl<int> &RepeatedMask) {
8248   return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8249 }
8250
8251 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8252 static bool
8253 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8254                                 SmallVectorImpl<int> &RepeatedMask) {
8255   return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8256 }
8257
8258 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8259 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8260 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8261                                         ArrayRef<int> Mask,
8262                                         SmallVectorImpl<int> &RepeatedMask) {
8263   int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8264   RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8265   int Size = Mask.size();
8266   for (int i = 0; i < Size; ++i) {
8267     assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8268     if (Mask[i] == SM_SentinelUndef)
8269       continue;
8270     if (Mask[i] == SM_SentinelZero) {
8271       if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8272         return false;
8273       RepeatedMask[i % LaneSize] = SM_SentinelZero;
8274       continue;
8275     }
8276     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8277       // This entry crosses lanes, so there is no way to model this shuffle.
8278       return false;
8279
8280     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8281     // Adjust second vector indices to start at LaneSize instead of Size.
8282     int LocalM =
8283         Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8284     if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8285       // This is the first non-undef entry in this slot of a 128-bit lane.
8286       RepeatedMask[i % LaneSize] = LocalM;
8287     else if (RepeatedMask[i % LaneSize] != LocalM)
8288       // Found a mismatch with the repeated mask.
8289       return false;
8290   }
8291   return true;
8292 }
8293
8294 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8295 /// arguments.
8296 ///
8297 /// This is a fast way to test a shuffle mask against a fixed pattern:
8298 ///
8299 ///   if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8300 ///
8301 /// It returns true if the mask is exactly as wide as the argument list, and
8302 /// each element of the mask is either -1 (signifying undef) or the value given
8303 /// in the argument.
8304 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8305                                 ArrayRef<int> ExpectedMask) {
8306   if (Mask.size() != ExpectedMask.size())
8307     return false;
8308
8309   int Size = Mask.size();
8310
8311   // If the values are build vectors, we can look through them to find
8312   // equivalent inputs that make the shuffles equivalent.
8313   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8314   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8315
8316   for (int i = 0; i < Size; ++i) {
8317     assert(Mask[i] >= -1 && "Out of bound mask element!");
8318     if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8319       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8320       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8321       if (!MaskBV || !ExpectedBV ||
8322           MaskBV->getOperand(Mask[i] % Size) !=
8323               ExpectedBV->getOperand(ExpectedMask[i] % Size))
8324         return false;
8325     }
8326   }
8327
8328   return true;
8329 }
8330
8331 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8332 ///
8333 /// The masks must be exactly the same width.
8334 ///
8335 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8336 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8337 ///
8338 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8339 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8340                                       ArrayRef<int> ExpectedMask) {
8341   int Size = Mask.size();
8342   if (Size != (int)ExpectedMask.size())
8343     return false;
8344
8345   for (int i = 0; i < Size; ++i)
8346     if (Mask[i] == SM_SentinelUndef)
8347       continue;
8348     else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8349       return false;
8350     else if (Mask[i] != ExpectedMask[i])
8351       return false;
8352
8353   return true;
8354 }
8355
8356 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
8357 // mask.
8358 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8359                                                     const APInt &Zeroable) {
8360   int NumElts = Mask.size();
8361   assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8362
8363   SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8364   for (int i = 0; i != NumElts; ++i) {
8365     int M = Mask[i];
8366     if (M == SM_SentinelUndef)
8367       continue;
8368     assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8369     TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8370   }
8371   return TargetMask;
8372 }
8373
8374 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
8375 // instructions.
8376 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8377   if (VT != MVT::v8i32 && VT != MVT::v8f32)
8378     return false;
8379
8380   SmallVector<int, 8> Unpcklwd;
8381   createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8382                           /* Unary = */ false);
8383   SmallVector<int, 8> Unpckhwd;
8384   createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8385                           /* Unary = */ false);
8386   bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8387                          isTargetShuffleEquivalent(Mask, Unpckhwd));
8388   return IsUnpackwdMask;
8389 }
8390
8391 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8392 ///
8393 /// This helper function produces an 8-bit shuffle immediate corresponding to
8394 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8395 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8396 /// example.
8397 ///
8398 /// NB: We rely heavily on "undef" masks preserving the input lane.
8399 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8400   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8401   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8402   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8403   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8404   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8405
8406   unsigned Imm = 0;
8407   Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8408   Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8409   Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8410   Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8411   return Imm;
8412 }
8413
8414 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8415                                           SelectionDAG &DAG) {
8416   return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8417 }
8418
8419 /// \brief Compute whether each element of a shuffle is zeroable.
8420 ///
8421 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8422 /// Either it is an undef element in the shuffle mask, the element of the input
8423 /// referenced is undef, or the element of the input referenced is known to be
8424 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8425 /// as many lanes with this technique as possible to simplify the remaining
8426 /// shuffle.
8427 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8428                                             SDValue V1, SDValue V2) {
8429   APInt Zeroable(Mask.size(), 0);
8430   V1 = peekThroughBitcasts(V1);
8431   V2 = peekThroughBitcasts(V2);
8432
8433   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8434   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8435
8436   int VectorSizeInBits = V1.getValueSizeInBits();
8437   int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8438   assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8439
8440   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8441     int M = Mask[i];
8442     // Handle the easy cases.
8443     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8444       Zeroable.setBit(i);
8445       continue;
8446     }
8447
8448     // Determine shuffle input and normalize the mask.
8449     SDValue V = M < Size ? V1 : V2;
8450     M %= Size;
8451
8452     // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8453     if (V.getOpcode() != ISD::BUILD_VECTOR)
8454       continue;
8455
8456     // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8457     // the (larger) source element must be UNDEF/ZERO.
8458     if ((Size % V.getNumOperands()) == 0) {
8459       int Scale = Size / V->getNumOperands();
8460       SDValue Op = V.getOperand(M / Scale);
8461       if (Op.isUndef() || X86::isZeroNode(Op))
8462         Zeroable.setBit(i);
8463       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8464         APInt Val = Cst->getAPIntValue();
8465         Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8466         Val = Val.getLoBits(ScalarSizeInBits);
8467         if (Val == 0)
8468           Zeroable.setBit(i);
8469       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8470         APInt Val = Cst->getValueAPF().bitcastToAPInt();
8471         Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8472         Val = Val.getLoBits(ScalarSizeInBits);
8473         if (Val == 0)
8474           Zeroable.setBit(i);
8475       }
8476       continue;
8477     }
8478
8479     // If the BUILD_VECTOR has more elements then all the (smaller) source
8480     // elements must be UNDEF or ZERO.
8481     if ((V.getNumOperands() % Size) == 0) {
8482       int Scale = V->getNumOperands() / Size;
8483       bool AllZeroable = true;
8484       for (int j = 0; j < Scale; ++j) {
8485         SDValue Op = V.getOperand((M * Scale) + j);
8486         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8487       }
8488       if (AllZeroable)
8489         Zeroable.setBit(i);
8490       continue;
8491     }
8492   }
8493
8494   return Zeroable;
8495 }
8496
8497 // The Shuffle result is as follow:
8498 // 0*a[0]0*a[1]...0*a[n] , n >=0 where a[] elements in a ascending order.
8499 // Each Zeroable's element correspond to a particular Mask's element.
8500 // As described in computeZeroableShuffleElements function.
8501 //
8502 // The function looks for a sub-mask that the nonzero elements are in
8503 // increasing order. If such sub-mask exist. The function returns true.
8504 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8505                                      ArrayRef<int> Mask, const EVT &VectorType,
8506                                      bool &IsZeroSideLeft) {
8507   int NextElement = -1;
8508   // Check if the Mask's nonzero elements are in increasing order.
8509   for (int i = 0, e = Mask.size(); i < e; i++) {
8510     // Checks if the mask's zeros elements are built from only zeros.
8511     assert(Mask[i] >= -1 && "Out of bound mask element!");
8512     if (Mask[i] < 0)
8513       return false;
8514     if (Zeroable[i])
8515       continue;
8516     // Find the lowest non zero element
8517     if (NextElement < 0) {
8518       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8519       IsZeroSideLeft = NextElement != 0;
8520     }
8521     // Exit if the mask's non zero elements are not in increasing order.
8522     if (NextElement != Mask[i])
8523       return false;
8524     NextElement++;
8525   }
8526   return true;
8527 }
8528
8529 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8530 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8531                                             ArrayRef<int> Mask, SDValue V1,
8532                                             SDValue V2,
8533                                             const APInt &Zeroable,
8534                                             const X86Subtarget &Subtarget,
8535                                             SelectionDAG &DAG) {
8536   int Size = Mask.size();
8537   int LaneSize = 128 / VT.getScalarSizeInBits();
8538   const int NumBytes = VT.getSizeInBits() / 8;
8539   const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8540
8541   assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8542          (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8543          (Subtarget.hasBWI() && VT.is512BitVector()));
8544
8545   SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8546   // Sign bit set in i8 mask means zero element.
8547   SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8548
8549   SDValue V;
8550   for (int i = 0; i < NumBytes; ++i) {
8551     int M = Mask[i / NumEltBytes];
8552     if (M < 0) {
8553       PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8554       continue;
8555     }
8556     if (Zeroable[i / NumEltBytes]) {
8557       PSHUFBMask[i] = ZeroMask;
8558       continue;
8559     }
8560
8561     // We can only use a single input of V1 or V2.
8562     SDValue SrcV = (M >= Size ? V2 : V1);
8563     if (V && V != SrcV)
8564       return SDValue();
8565     V = SrcV;
8566     M %= Size;
8567
8568     // PSHUFB can't cross lanes, ensure this doesn't happen.
8569     if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8570       return SDValue();
8571
8572     M = M % LaneSize;
8573     M = M * NumEltBytes + (i % NumEltBytes);
8574     PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8575   }
8576   assert(V && "Failed to find a source input");
8577
8578   MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8579   return DAG.getBitcast(
8580       VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8581                       DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8582 }
8583
// Forward declaration: lowerVectorShuffleToEXPAND below calls getMaskNode
// before a definition of it has been seen in this file.
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);
8587
8588 // X86 has dedicated shuffle that can be lowered to VEXPAND
8589 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8590                                           const APInt &Zeroable,
8591                                           ArrayRef<int> Mask, SDValue &V1,
8592                                           SDValue &V2, SelectionDAG &DAG,
8593                                           const X86Subtarget &Subtarget) {
8594   bool IsLeftZeroSide = true;
8595   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8596                                 IsLeftZeroSide))
8597     return SDValue();
8598   unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8599   MVT IntegerType =
8600       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8601   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8602   unsigned NumElts = VT.getVectorNumElements();
8603   assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8604          "Unexpected number of vector elements");
8605   SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8606                               Subtarget, DAG, DL);
8607   SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8608   SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8609   return DAG.getSelect(DL, VT, VMask,
8610                        DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8611                        ZeroVector);
8612 }
8613
8614 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8615                                         unsigned &UnpackOpcode, bool IsUnary,
8616                                         ArrayRef<int> TargetMask, SDLoc &DL,
8617                                         SelectionDAG &DAG,
8618                                         const X86Subtarget &Subtarget) {
8619   int NumElts = VT.getVectorNumElements();
8620
8621   bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8622   for (int i = 0; i != NumElts; i += 2) {
8623     int M1 = TargetMask[i + 0];
8624     int M2 = TargetMask[i + 1];
8625     Undef1 &= (SM_SentinelUndef == M1);
8626     Undef2 &= (SM_SentinelUndef == M2);
8627     Zero1 &= isUndefOrZero(M1);
8628     Zero2 &= isUndefOrZero(M2);
8629   }
8630   assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8631          "Zeroable shuffle detected");
8632
8633   // Attempt to match the target mask against the unpack lo/hi mask patterns.
8634   SmallVector<int, 64> Unpckl, Unpckh;
8635   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8636   if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8637     UnpackOpcode = X86ISD::UNPCKL;
8638     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8639     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8640     return true;
8641   }
8642
8643   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8644   if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8645     UnpackOpcode = X86ISD::UNPCKH;
8646     V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8647     V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8648     return true;
8649   }
8650
8651   // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8652   if (IsUnary && (Zero1 || Zero2)) {
8653     // Don't bother if we can blend instead.
8654     if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8655         isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8656       return false;
8657
8658     bool MatchLo = true, MatchHi = true;
8659     for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8660       int M = TargetMask[i];
8661
8662       // Ignore if the input is known to be zero or the index is undef.
8663       if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8664           (M == SM_SentinelUndef))
8665         continue;
8666
8667       MatchLo &= (M == Unpckl[i]);
8668       MatchHi &= (M == Unpckh[i]);
8669     }
8670
8671     if (MatchLo || MatchHi) {
8672       UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8673       V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8674       V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8675       return true;
8676     }
8677   }
8678
8679   // If a binary shuffle, commute and try again.
8680   if (!IsUnary) {
8681     ShuffleVectorSDNode::commuteMask(Unpckl);
8682     if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8683       UnpackOpcode = X86ISD::UNPCKL;
8684       std::swap(V1, V2);
8685       return true;
8686     }
8687
8688     ShuffleVectorSDNode::commuteMask(Unpckh);
8689     if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8690       UnpackOpcode = X86ISD::UNPCKH;
8691       std::swap(V1, V2);
8692       return true;
8693     }
8694   }
8695
8696   return false;
8697 }
8698
8699 // X86 has dedicated unpack instructions that can handle specific blend
8700 // operations: UNPCKH and UNPCKL.
8701 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8702                                            ArrayRef<int> Mask, SDValue V1,
8703                                            SDValue V2, SelectionDAG &DAG) {
8704   SmallVector<int, 8> Unpckl;
8705   createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8706   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8707     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8708
8709   SmallVector<int, 8> Unpckh;
8710   createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8711   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8712     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8713
8714   // Commute and try again.
8715   ShuffleVectorSDNode::commuteMask(Unpckl);
8716   if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8717     return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8718
8719   ShuffleVectorSDNode::commuteMask(Unpckh);
8720   if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8721     return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8722
8723   return SDValue();
8724 }
8725
8726 /// \brief Try to emit a bitmask instruction for a shuffle.
8727 ///
8728 /// This handles cases where we can model a blend exactly as a bitmask due to
8729 /// one of the inputs being zeroable.
8730 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8731                                            SDValue V2, ArrayRef<int> Mask,
8732                                            const APInt &Zeroable,
8733                                            SelectionDAG &DAG) {
8734   assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8735   MVT EltVT = VT.getVectorElementType();
8736   SDValue Zero = DAG.getConstant(0, DL, EltVT);
8737   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8738   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8739   SDValue V;
8740   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8741     if (Zeroable[i])
8742       continue;
8743     if (Mask[i] % Size != i)
8744       return SDValue(); // Not a blend.
8745     if (!V)
8746       V = Mask[i] < Size ? V1 : V2;
8747     else if (V != (Mask[i] < Size ? V1 : V2))
8748       return SDValue(); // Can only let one input through the mask.
8749
8750     VMaskOps[i] = AllOnes;
8751   }
8752   if (!V)
8753     return SDValue(); // No non-zeroable elements!
8754
8755   SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8756   return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8757 }
8758
8759 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8760 ///
8761 /// This is used as a fallback approach when first class blend instructions are
8762 /// unavailable. Currently it is only suitable for integer vectors, but could
8763 /// be generalized for floating point vectors if desirable.
8764 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8765                                             SDValue V2, ArrayRef<int> Mask,
8766                                             SelectionDAG &DAG) {
8767   assert(VT.isInteger() && "Only supports integer vector types!");
8768   MVT EltVT = VT.getVectorElementType();
8769   SDValue Zero = DAG.getConstant(0, DL, EltVT);
8770   SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8771   SmallVector<SDValue, 16> MaskOps;
8772   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8773     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8774       return SDValue(); // Shuffled input!
8775     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8776   }
8777
8778   SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8779   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8780   // We have to cast V2 around.
8781   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8782   V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8783                                       DAG.getBitcast(MaskVT, V1Mask),
8784                                       DAG.getBitcast(MaskVT, V2)));
8785   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8786 }
8787
// Forward declaration: lowerVectorShuffleAsBlend below calls
// getVectorMaskingNode before a definition of it has been seen in this file.
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);
8792
8793 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8794                                       MutableArrayRef<int> TargetMask,
8795                                       bool &ForceV1Zero, bool &ForceV2Zero,
8796                                       uint64_t &BlendMask) {
8797   bool V1IsZeroOrUndef =
8798       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8799   bool V2IsZeroOrUndef =
8800       V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
8801
8802   BlendMask = 0;
8803   ForceV1Zero = false, ForceV2Zero = false;
8804   assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8805
8806   // Attempt to generate the binary blend mask. If an input is zero then
8807   // we can use any lane.
8808   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8809   for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8810     int M = TargetMask[i];
8811     if (M == SM_SentinelUndef)
8812       continue;
8813     if (M == i)
8814       continue;
8815     if (M == i + Size) {
8816       BlendMask |= 1ull << i;
8817       continue;
8818     }
8819     if (M == SM_SentinelZero) {
8820       if (V1IsZeroOrUndef) {
8821         ForceV1Zero = true;
8822         TargetMask[i] = i;
8823         continue;
8824       }
8825       if (V2IsZeroOrUndef) {
8826         ForceV2Zero = true;
8827         BlendMask |= 1ull << i;
8828         TargetMask[i] = i + Size;
8829         continue;
8830       }
8831     }
8832     return false;
8833   }
8834   return true;
8835 }
8836
/// Widen a per-element blend mask so that each element is described by
/// \p Scale consecutive bits.
///
/// For every set bit i (0 <= i < \p Size) of \p BlendMask, bits
/// [i * Scale, (i + 1) * Scale) of the result are set. Bits of \p BlendMask at
/// or above \p Size are ignored.
///
/// The scaled mask must fit in 64 bits, i.e. Size * Scale <= 64 with
/// Scale < 64; otherwise the shifts below would be undefined.
///
/// This is a file-local helper, so it is given internal linkage like the
/// other shuffle lowering helpers in this file.
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  assert(Size >= 0 && Scale >= 0 && Scale < 64 && Size * Scale <= 64 &&
         "Scaled blend mask does not fit in 64 bits");
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
8844
8845 /// \brief Try to emit a blend instruction for a shuffle.
8846 ///
8847 /// This doesn't do any checks for the availability of instructions for blending
8848 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8849 /// be matched in the backend with the type given. What it does check for is
8850 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8851 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8852                                          SDValue V2, ArrayRef<int> Original,
8853                                          const APInt &Zeroable,
8854                                          const X86Subtarget &Subtarget,
8855                                          SelectionDAG &DAG) {
8856   SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8857
8858   uint64_t BlendMask = 0;
8859   bool ForceV1Zero = false, ForceV2Zero = false;
8860   if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
8861                                  BlendMask))
8862     return SDValue();
8863
8864   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8865   if (ForceV1Zero)
8866     V1 = getZeroVector(VT, Subtarget, DAG, DL);
8867   if (ForceV2Zero)
8868     V2 = getZeroVector(VT, Subtarget, DAG, DL);
8869
8870   switch (VT.SimpleTy) {
8871   case MVT::v2f64:
8872   case MVT::v4f32:
8873   case MVT::v4f64:
8874   case MVT::v8f32:
8875     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8876                        DAG.getConstant(BlendMask, DL, MVT::i8));
8877
8878   case MVT::v4i64:
8879   case MVT::v8i32:
8880     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8881     LLVM_FALLTHROUGH;
8882   case MVT::v2i64:
8883   case MVT::v4i32:
8884     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8885     // that instruction.
8886     if (Subtarget.hasAVX2()) {
8887       // Scale the blend by the number of 32-bit dwords per element.
8888       int Scale =  VT.getScalarSizeInBits() / 32;
8889       BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8890       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8891       V1 = DAG.getBitcast(BlendVT, V1);
8892       V2 = DAG.getBitcast(BlendVT, V2);
8893       return DAG.getBitcast(
8894           VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8895                           DAG.getConstant(BlendMask, DL, MVT::i8)));
8896     }
8897     LLVM_FALLTHROUGH;
8898   case MVT::v8i16: {
8899     // For integer shuffles we need to expand the mask and cast the inputs to
8900     // v8i16s prior to blending.
8901     int Scale = 8 / VT.getVectorNumElements();
8902     BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8903     V1 = DAG.getBitcast(MVT::v8i16, V1);
8904     V2 = DAG.getBitcast(MVT::v8i16, V2);
8905     return DAG.getBitcast(VT,
8906                           DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8907                                       DAG.getConstant(BlendMask, DL, MVT::i8)));
8908   }
8909
8910   case MVT::v16i16: {
8911     assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8912     SmallVector<int, 8> RepeatedMask;
8913     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8914       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8915       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8916       BlendMask = 0;
8917       for (int i = 0; i < 8; ++i)
8918         if (RepeatedMask[i] >= 8)
8919           BlendMask |= 1ull << i;
8920       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8921                          DAG.getConstant(BlendMask, DL, MVT::i8));
8922     }
8923     LLVM_FALLTHROUGH;
8924   }
8925   case MVT::v16i8:
8926   case MVT::v32i8: {
8927     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8928            "256-bit byte-blends require AVX2 support!");
8929
8930     if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
8931       MVT IntegerType =
8932           MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8933       SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8934       return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8935     }
8936
8937     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8938     if (SDValue Masked =
8939             lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8940       return Masked;
8941
8942     // Scale the blend by the number of bytes per element.
8943     int Scale = VT.getScalarSizeInBits() / 8;
8944
8945     // This form of blend is always done on bytes. Compute the byte vector
8946     // type.
8947     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8948
8949     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8950     // mix of LLVM's code generator and the x86 backend. We tell the code
8951     // generator that boolean values in the elements of an x86 vector register
8952     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8953     // mapping a select to operand #1, and 'false' mapping to operand #2. The
8954     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8955     // of the element (the remaining are ignored) and 0 in that high bit would
8956     // mean operand #1 while 1 in the high bit would mean operand #2. So while
8957     // the LLVM model for boolean values in vector elements gets the relevant
8958     // bit set, it is set backwards and over constrained relative to x86's
8959     // actual model.
8960     SmallVector<SDValue, 32> VSELECTMask;
8961     for (int i = 0, Size = Mask.size(); i < Size; ++i)
8962       for (int j = 0; j < Scale; ++j)
8963         VSELECTMask.push_back(
8964             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8965                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8966                                           MVT::i8));
8967
8968     V1 = DAG.getBitcast(BlendVT, V1);
8969     V2 = DAG.getBitcast(BlendVT, V2);
8970     return DAG.getBitcast(
8971         VT,
8972         DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
8973                       V1, V2));
8974   }
8975   case MVT::v16f32:
8976   case MVT::v8f64:
8977   case MVT::v8i64:
8978   case MVT::v16i32:
8979   case MVT::v32i16:
8980   case MVT::v64i8: {
8981     MVT IntegerType =
8982         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8983     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8984     return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8985   }
8986   default:
8987     llvm_unreachable("Not a supported integer vector type!");
8988   }
8989 }
8990
8991 /// \brief Try to lower as a blend of elements from two inputs followed by
8992 /// a single-input permutation.
8993 ///
8994 /// This matches the pattern where we can blend elements from two inputs and
8995 /// then reduce the shuffle to a single-input permutation.
8996 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8997                                                    SDValue V1, SDValue V2,
8998                                                    ArrayRef<int> Mask,
8999                                                    SelectionDAG &DAG) {
9000   // We build up the blend mask while checking whether a blend is a viable way
9001   // to reduce the shuffle.
9002   SmallVector<int, 32> BlendMask(Mask.size(), -1);
9003   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9004
9005   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9006     if (Mask[i] < 0)
9007       continue;
9008
9009     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9010
9011     if (BlendMask[Mask[i] % Size] < 0)
9012       BlendMask[Mask[i] % Size] = Mask[i];
9013     else if (BlendMask[Mask[i] % Size] != Mask[i])
9014       return SDValue(); // Can't blend in the needed input!
9015
9016     PermuteMask[i] = Mask[i] % Size;
9017   }
9018
9019   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9020   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9021 }
9022
9023 /// \brief Generic routine to decompose a shuffle and blend into independent
9024 /// blends and permutes.
9025 ///
9026 /// This matches the extremely common pattern for handling combined
9027 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9028 /// operations. It will try to pick the best arrangement of shuffles and
9029 /// blends.
9030 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9031                                                           MVT VT, SDValue V1,
9032                                                           SDValue V2,
9033                                                           ArrayRef<int> Mask,
9034                                                           SelectionDAG &DAG) {
9035   // Shuffle the input elements into the desired positions in V1 and V2 and
9036   // blend them together.
9037   SmallVector<int, 32> V1Mask(Mask.size(), -1);
9038   SmallVector<int, 32> V2Mask(Mask.size(), -1);
9039   SmallVector<int, 32> BlendMask(Mask.size(), -1);
9040   for (int i = 0, Size = Mask.size(); i < Size; ++i)
9041     if (Mask[i] >= 0 && Mask[i] < Size) {
9042       V1Mask[i] = Mask[i];
9043       BlendMask[i] = i;
9044     } else if (Mask[i] >= Size) {
9045       V2Mask[i] = Mask[i] - Size;
9046       BlendMask[i] = i + Size;
9047     }
9048
9049   // Try to lower with the simpler initial blend strategy unless one of the
9050   // input shuffles would be a no-op. We prefer to shuffle inputs as the
9051   // shuffle may be able to fold with a load or other benefit. However, when
9052   // we'll have to do 2x as many shuffles in order to achieve this, blending
9053   // first is a better strategy.
9054   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9055     if (SDValue BlendPerm =
9056             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9057       return BlendPerm;
9058
9059   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9060   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9061   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9062 }
9063
9064 /// \brief Try to lower a vector shuffle as a rotation.
9065 ///
9066 /// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9067 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9068                                       ArrayRef<int> Mask) {
9069   int NumElts = Mask.size();
9070
9071   // We need to detect various ways of spelling a rotation:
9072   //   [11, 12, 13, 14, 15,  0,  1,  2]
9073   //   [-1, 12, 13, 14, -1, -1,  1, -1]
9074   //   [-1, -1, -1, -1, -1, -1,  1,  2]
9075   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
9076   //   [-1,  4,  5,  6, -1, -1,  9, -1]
9077   //   [-1,  4,  5,  6, -1, -1, -1, -1]
9078   int Rotation = 0;
9079   SDValue Lo, Hi;
9080   for (int i = 0; i < NumElts; ++i) {
9081     int M = Mask[i];
9082     assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9083            "Unexpected mask index.");
9084     if (M < 0)
9085       continue;
9086
9087     // Determine where a rotated vector would have started.
9088     int StartIdx = i - (M % NumElts);
9089     if (StartIdx == 0)
9090       // The identity rotation isn't interesting, stop.
9091       return -1;
9092
9093     // If we found the tail of a vector the rotation must be the missing
9094     // front. If we found the head of a vector, it must be how much of the
9095     // head.
9096     int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9097
9098     if (Rotation == 0)
9099       Rotation = CandidateRotation;
9100     else if (Rotation != CandidateRotation)
9101       // The rotations don't match, so we can't match this mask.
9102       return -1;
9103
9104     // Compute which value this mask is pointing at.
9105     SDValue MaskV = M < NumElts ? V1 : V2;
9106
9107     // Compute which of the two target values this index should be assigned
9108     // to. This reflects whether the high elements are remaining or the low
9109     // elements are remaining.
9110     SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9111
9112     // Either set up this value if we've not encountered it before, or check
9113     // that it remains consistent.
9114     if (!TargetV)
9115       TargetV = MaskV;
9116     else if (TargetV != MaskV)
9117       // This may be a rotation, but it pulls from the inputs in some
9118       // unsupported interleaving.
9119       return -1;
9120   }
9121
9122   // Check that we successfully analyzed the mask, and normalize the results.
9123   assert(Rotation != 0 && "Failed to locate a viable rotation!");
9124   assert((Lo || Hi) && "Failed to find a rotated input vector!");
9125   if (!Lo)
9126     Lo = Hi;
9127   else if (!Hi)
9128     Hi = Lo;
9129
9130   V1 = Lo;
9131   V2 = Hi;
9132
9133   return Rotation;
9134 }
9135
9136 /// \brief Try to lower a vector shuffle as a byte rotation.
9137 ///
9138 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9139 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9140 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9141 /// try to generically lower a vector shuffle through such an pattern. It
9142 /// does not check for the profitability of lowering either as PALIGNR or
9143 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9144 /// This matches shuffle vectors that look like:
9145 ///
9146 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9147 ///
9148 /// Essentially it concatenates V1 and V2, shifts right by some number of
9149 /// elements, and takes the low elements as the result. Note that while this is
9150 /// specified as a *right shift* because x86 is little-endian, it is a *left
9151 /// rotate* of the vector lanes.
9152 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9153                                           ArrayRef<int> Mask) {
9154   // Don't accept any shuffles with zero elements.
9155   if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9156     return -1;
9157
9158   // PALIGNR works on 128-bit lanes.
9159   SmallVector<int, 16> RepeatedMask;
9160   if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9161     return -1;
9162
9163   int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9164   if (Rotation <= 0)
9165     return -1;
9166
9167   // PALIGNR rotates bytes, so we need to scale the
9168   // rotation based on how many bytes are in the vector lane.
9169   int NumElts = RepeatedMask.size();
9170   int Scale = 16 / NumElts;
9171   return Rotation * Scale;
9172 }
9173
9174 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9175                                               SDValue V1, SDValue V2,
9176                                               ArrayRef<int> Mask,
9177                                               const X86Subtarget &Subtarget,
9178                                               SelectionDAG &DAG) {
9179   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9180
9181   SDValue Lo = V1, Hi = V2;
9182   int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9183   if (ByteRotation <= 0)
9184     return SDValue();
9185
9186   // Cast the inputs to i8 vector of correct length to match PALIGNR or
9187   // PSLLDQ/PSRLDQ.
9188   MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9189   Lo = DAG.getBitcast(ByteVT, Lo);
9190   Hi = DAG.getBitcast(ByteVT, Hi);
9191
9192   // SSSE3 targets can use the palignr instruction.
9193   if (Subtarget.hasSSSE3()) {
9194     assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9195            "512-bit PALIGNR requires BWI instructions");
9196     return DAG.getBitcast(
9197         VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9198                         DAG.getConstant(ByteRotation, DL, MVT::i8)));
9199   }
9200
9201   assert(VT.is128BitVector() &&
9202          "Rotate-based lowering only supports 128-bit lowering!");
9203   assert(Mask.size() <= 16 &&
9204          "Can shuffle at most 16 bytes in a 128-bit vector!");
9205   assert(ByteVT == MVT::v16i8 &&
9206          "SSE2 rotate lowering only needed for v16i8!");
9207
9208   // Default SSE2 implementation
9209   int LoByteShift = 16 - ByteRotation;
9210   int HiByteShift = ByteRotation;
9211
9212   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9213                                 DAG.getConstant(LoByteShift, DL, MVT::i8));
9214   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9215                                 DAG.getConstant(HiByteShift, DL, MVT::i8));
9216   return DAG.getBitcast(VT,
9217                         DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9218 }
9219
9220 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9221 ///
9222 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
9223 /// rotation of the concatenation of two vectors; This routine will
9224 /// try to generically lower a vector shuffle through such an pattern.
9225 ///
9226 /// Essentially it concatenates V1 and V2, shifts right by some number of
9227 /// elements, and takes the low elements as the result. Note that while this is
9228 /// specified as a *right shift* because x86 is little-endian, it is a *left
9229 /// rotate* of the vector lanes.
9230 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9231                                           SDValue V1, SDValue V2,
9232                                           ArrayRef<int> Mask,
9233                                           const X86Subtarget &Subtarget,
9234                                           SelectionDAG &DAG) {
9235   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9236          "Only 32-bit and 64-bit elements are supported!");
9237
9238   // 128/256-bit vectors are only supported with VLX.
9239   assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9240          && "VLX required for 128/256-bit vectors");
9241
9242   SDValue Lo = V1, Hi = V2;
9243   int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9244   if (Rotation <= 0)
9245     return SDValue();
9246
9247   return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9248                      DAG.getConstant(Rotation, DL, MVT::i8));
9249 }
9250
9251 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9252 ///
9253 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9254 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9255 /// matches elements from one of the input vectors shuffled to the left or
9256 /// right with zeroable elements 'shifted in'. It handles both the strictly
9257 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9258 /// quad word lane.
9259 ///
9260 /// PSHL : (little-endian) left bit shift.
9261 /// [ zz, 0, zz,  2 ]
9262 /// [ -1, 4, zz, -1 ]
9263 /// PSRL : (little-endian) right bit shift.
9264 /// [  1, zz,  3, zz]
9265 /// [ -1, -1,  7, zz]
9266 /// PSLLDQ : (little-endian) left byte shift
9267 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
9268 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
9269 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
9270 /// PSRLDQ : (little-endian) right byte shift
9271 /// [  5, 6,  7, zz, zz, zz, zz, zz]
9272 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
9273 /// [  1, 2, -1, -1, -1, -1, zz, zz]
9274 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9275                                      unsigned ScalarSizeInBits,
9276                                      ArrayRef<int> Mask, int MaskOffset,
9277                                      const APInt &Zeroable,
9278                                      const X86Subtarget &Subtarget) {
9279   int Size = Mask.size();
9280   unsigned SizeInBits = Size * ScalarSizeInBits;
9281
9282   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9283     for (int i = 0; i < Size; i += Scale)
9284       for (int j = 0; j < Shift; ++j)
9285         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9286           return false;
9287
9288     return true;
9289   };
9290
9291   auto MatchShift = [&](int Shift, int Scale, bool Left) {
9292     for (int i = 0; i != Size; i += Scale) {
9293       unsigned Pos = Left ? i + Shift : i;
9294       unsigned Low = Left ? i : i + Shift;
9295       unsigned Len = Scale - Shift;
9296       if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9297         return -1;
9298     }
9299
9300     int ShiftEltBits = ScalarSizeInBits * Scale;
9301     bool ByteShift = ShiftEltBits > 64;
9302     Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9303                   : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9304     int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9305
9306     // Normalize the scale for byte shifts to still produce an i64 element
9307     // type.
9308     Scale = ByteShift ? Scale / 2 : Scale;
9309
9310     // We need to round trip through the appropriate type for the shift.
9311     MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9312     ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9313                         : MVT::getVectorVT(ShiftSVT, Size / Scale);
9314     return (int)ShiftAmt;
9315   };
9316
9317   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9318   // keep doubling the size of the integer elements up to that. We can
9319   // then shift the elements of the integer vector by whole multiples of
9320   // their width within the elements of the larger integer vector. Test each
9321   // multiple to see if we can find a match with the moved element indices
9322   // and that the shifted in elements are all zeroable.
9323   unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9324   for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9325     for (int Shift = 1; Shift != Scale; ++Shift)
9326       for (bool Left : {true, false})
9327         if (CheckZeros(Shift, Scale, Left)) {
9328           int ShiftAmt = MatchShift(Shift, Scale, Left);
9329           if (0 < ShiftAmt)
9330             return ShiftAmt;
9331         }
9332
9333   // no match
9334   return -1;
9335 }
9336
9337 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9338                                          SDValue V2, ArrayRef<int> Mask,
9339                                          const APInt &Zeroable,
9340                                          const X86Subtarget &Subtarget,
9341                                          SelectionDAG &DAG) {
9342   int Size = Mask.size();
9343   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9344
9345   MVT ShiftVT;
9346   SDValue V = V1;
9347   unsigned Opcode;
9348
9349   // Try to match shuffle against V1 shift.
9350   int ShiftAmt = matchVectorShuffleAsShift(
9351       ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9352
9353   // If V1 failed, try to match shuffle against V2 shift.
9354   if (ShiftAmt < 0) {
9355     ShiftAmt =
9356         matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9357                                   Mask, Size, Zeroable, Subtarget);
9358     V = V2;
9359   }
9360
9361   if (ShiftAmt < 0)
9362     return SDValue();
9363
9364   assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9365          "Illegal integer vector type");
9366   V = DAG.getBitcast(ShiftVT, V);
9367   V = DAG.getNode(Opcode, DL, ShiftVT, V,
9368                   DAG.getConstant(ShiftAmt, DL, MVT::i8));
9369   return DAG.getBitcast(VT, V);
9370 }
9371
9372 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9373 // Remainder of lower half result is zero and upper half is all undef.
9374 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9375                                       ArrayRef<int> Mask, uint64_t &BitLen,
9376                                       uint64_t &BitIdx, const APInt &Zeroable) {
9377   int Size = Mask.size();
9378   int HalfSize = Size / 2;
9379   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9380   assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9381
9382   // Upper half must be undefined.
9383   if (!isUndefInRange(Mask, HalfSize, HalfSize))
9384     return false;
9385
9386   // Determine the extraction length from the part of the
9387   // lower half that isn't zeroable.
9388   int Len = HalfSize;
9389   for (; Len > 0; --Len)
9390     if (!Zeroable[Len - 1])
9391       break;
9392   assert(Len > 0 && "Zeroable shuffle mask");
9393
9394   // Attempt to match first Len sequential elements from the lower half.
9395   SDValue Src;
9396   int Idx = -1;
9397   for (int i = 0; i != Len; ++i) {
9398     int M = Mask[i];
9399     if (M == SM_SentinelUndef)
9400       continue;
9401     SDValue &V = (M < Size ? V1 : V2);
9402     M = M % Size;
9403
9404     // The extracted elements must start at a valid index and all mask
9405     // elements must be in the lower half.
9406     if (i > M || M >= HalfSize)
9407       return false;
9408
9409     if (Idx < 0 || (Src == V && Idx == (M - i))) {
9410       Src = V;
9411       Idx = M - i;
9412       continue;
9413     }
9414     return false;
9415   }
9416
9417   if (!Src || Idx < 0)
9418     return false;
9419
9420   assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9421   BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9422   BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9423   V1 = Src;
9424   return true;
9425 }
9426
9427 // INSERTQ: Extract lowest Len elements from lower half of second source and
9428 // insert over first source, starting at Idx.
9429 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9430 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9431                                         ArrayRef<int> Mask, uint64_t &BitLen,
9432                                         uint64_t &BitIdx) {
9433   int Size = Mask.size();
9434   int HalfSize = Size / 2;
9435   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9436
9437   // Upper half must be undefined.
9438   if (!isUndefInRange(Mask, HalfSize, HalfSize))
9439     return false;
9440
9441   for (int Idx = 0; Idx != HalfSize; ++Idx) {
9442     SDValue Base;
9443
9444     // Attempt to match first source from mask before insertion point.
9445     if (isUndefInRange(Mask, 0, Idx)) {
9446       /* EMPTY */
9447     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9448       Base = V1;
9449     } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9450       Base = V2;
9451     } else {
9452       continue;
9453     }
9454
9455     // Extend the extraction length looking to match both the insertion of
9456     // the second source and the remaining elements of the first.
9457     for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9458       SDValue Insert;
9459       int Len = Hi - Idx;
9460
9461       // Match insertion.
9462       if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9463         Insert = V1;
9464       } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9465         Insert = V2;
9466       } else {
9467         continue;
9468       }
9469
9470       // Match the remaining elements of the lower half.
9471       if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9472         /* EMPTY */
9473       } else if ((!Base || (Base == V1)) &&
9474                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9475         Base = V1;
9476       } else if ((!Base || (Base == V2)) &&
9477                  isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9478                                             Size + Hi)) {
9479         Base = V2;
9480       } else {
9481         continue;
9482       }
9483
9484       BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9485       BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9486       V1 = Base;
9487       V2 = Insert;
9488       return true;
9489     }
9490   }
9491
9492   return false;
9493 }
9494
9495 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9496 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9497                                            SDValue V2, ArrayRef<int> Mask,
9498                                            const APInt &Zeroable,
9499                                            SelectionDAG &DAG) {
9500   uint64_t BitLen, BitIdx;
9501   if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9502     return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9503                        DAG.getConstant(BitLen, DL, MVT::i8),
9504                        DAG.getConstant(BitIdx, DL, MVT::i8));
9505
9506   if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9507     return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9508                        V2 ? V2 : DAG.getUNDEF(VT),
9509                        DAG.getConstant(BitLen, DL, MVT::i8),
9510                        DAG.getConstant(BitIdx, DL, MVT::i8));
9511
9512   return SDValue();
9513 }
9514
9515 /// \brief Lower a vector shuffle as a zero or any extension.
9516 ///
9517 /// Given a specific number of elements, element bit width, and extension
9518 /// stride, produce either a zero or any extension based on the available
9519 /// features of the subtarget. The extended elements are consecutive and
9520 /// begin and can start from an offsetted element index in the input; to
9521 /// avoid excess shuffling the offset must either being in the bottom lane
9522 /// or at the start of a higher lane. All extended elements must be from
9523 /// the same lane.
9524 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9525     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9526     ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9527   assert(Scale > 1 && "Need a scale to extend.");
9528   int EltBits = VT.getScalarSizeInBits();
9529   int NumElements = VT.getVectorNumElements();
9530   int NumEltsPerLane = 128 / EltBits;
9531   int OffsetLane = Offset / NumEltsPerLane;
9532   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9533          "Only 8, 16, and 32 bit elements can be extended.");
9534   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9535   assert(0 <= Offset && "Extension offset must be positive.");
9536   assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9537          "Extension offset must be in the first lane or start an upper lane.");
9538
9539   // Check that an index is in same lane as the base offset.
9540   auto SafeOffset = [&](int Idx) {
9541     return OffsetLane == (Idx / NumEltsPerLane);
9542   };
9543
9544   // Shift along an input so that the offset base moves to the first element.
9545   auto ShuffleOffset = [&](SDValue V) {
9546     if (!Offset)
9547       return V;
9548
9549     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9550     for (int i = 0; i * Scale < NumElements; ++i) {
9551       int SrcIdx = i + Offset;
9552       ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9553     }
9554     return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9555   };
9556
9557   // Found a valid zext mask! Try various lowering strategies based on the
9558   // input type and available ISA extensions.
9559   if (Subtarget.hasSSE41()) {
9560     // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9561     // PUNPCK will catch this in a later shuffle match.
9562     if (Offset && Scale == 2 && VT.is128BitVector())
9563       return SDValue();
9564     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9565                                  NumElements / Scale);
9566     InputV = ShuffleOffset(InputV);
9567     InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9568     return DAG.getBitcast(VT, InputV);
9569   }
9570
9571   assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9572
9573   // For any extends we can cheat for larger element sizes and use shuffle
9574   // instructions that can fold with a load and/or copy.
9575   if (AnyExt && EltBits == 32) {
9576     int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9577                          -1};
9578     return DAG.getBitcast(
9579         VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9580                         DAG.getBitcast(MVT::v4i32, InputV),
9581                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9582   }
9583   if (AnyExt && EltBits == 16 && Scale > 2) {
9584     int PSHUFDMask[4] = {Offset / 2, -1,
9585                          SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9586     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9587                          DAG.getBitcast(MVT::v4i32, InputV),
9588                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9589     int PSHUFWMask[4] = {1, -1, -1, -1};
9590     unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9591     return DAG.getBitcast(
9592         VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9593                         DAG.getBitcast(MVT::v8i16, InputV),
9594                         getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9595   }
9596
9597   // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9598   // to 64-bits.
9599   if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9600     assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9601     assert(VT.is128BitVector() && "Unexpected vector width!");
9602
9603     int LoIdx = Offset * EltBits;
9604     SDValue Lo = DAG.getBitcast(
9605         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9606                                 DAG.getConstant(EltBits, DL, MVT::i8),
9607                                 DAG.getConstant(LoIdx, DL, MVT::i8)));
9608
9609     if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9610         !SafeOffset(Offset + 1))
9611       return DAG.getBitcast(VT, Lo);
9612
9613     int HiIdx = (Offset + 1) * EltBits;
9614     SDValue Hi = DAG.getBitcast(
9615         MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9616                                 DAG.getConstant(EltBits, DL, MVT::i8),
9617                                 DAG.getConstant(HiIdx, DL, MVT::i8)));
9618     return DAG.getBitcast(VT,
9619                           DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9620   }
9621
9622   // If this would require more than 2 unpack instructions to expand, use
9623   // pshufb when available. We can only use more than 2 unpack instructions
9624   // when zero extending i8 elements which also makes it easier to use pshufb.
9625   if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9626     assert(NumElements == 16 && "Unexpected byte vector width!");
9627     SDValue PSHUFBMask[16];
9628     for (int i = 0; i < 16; ++i) {
9629       int Idx = Offset + (i / Scale);
9630       PSHUFBMask[i] = DAG.getConstant(
9631           (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9632     }
9633     InputV = DAG.getBitcast(MVT::v16i8, InputV);
9634     return DAG.getBitcast(
9635         VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9636                         DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9637   }
9638
9639   // If we are extending from an offset, ensure we start on a boundary that
9640   // we can unpack from.
9641   int AlignToUnpack = Offset % (NumElements / Scale);
9642   if (AlignToUnpack) {
9643     SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9644     for (int i = AlignToUnpack; i < NumElements; ++i)
9645       ShMask[i - AlignToUnpack] = i;
9646     InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9647     Offset -= AlignToUnpack;
9648   }
9649
9650   // Otherwise emit a sequence of unpacks.
9651   do {
9652     unsigned UnpackLoHi = X86ISD::UNPCKL;
9653     if (Offset >= (NumElements / 2)) {
9654       UnpackLoHi = X86ISD::UNPCKH;
9655       Offset -= (NumElements / 2);
9656     }
9657
9658     MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9659     SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9660                          : getZeroVector(InputVT, Subtarget, DAG, DL);
9661     InputV = DAG.getBitcast(InputVT, InputV);
9662     InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9663     Scale /= 2;
9664     EltBits *= 2;
9665     NumElements /= 2;
9666   } while (Scale > 1);
9667   return DAG.getBitcast(VT, InputV);
9668 }
9669
9670 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9671 ///
9672 /// This routine will try to do everything in its power to cleverly lower
9673 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9674 /// check for the profitability of this lowering,  it tries to aggressively
9675 /// match this pattern. It will use all of the micro-architectural details it
9676 /// can to emit an efficient lowering. It handles both blends with all-zero
9677 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9678 /// masking out later).
9679 ///
9680 /// The reason we have dedicated lowering for zext-style shuffles is that they
9681 /// are both incredibly common and often quite performance sensitive.
9682 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9683     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9684     const APInt &Zeroable, const X86Subtarget &Subtarget,
9685     SelectionDAG &DAG) {
9686   int Bits = VT.getSizeInBits();
9687   int NumLanes = Bits / 128;
9688   int NumElements = VT.getVectorNumElements();
9689   int NumEltsPerLane = NumElements / NumLanes;
9690   assert(VT.getScalarSizeInBits() <= 32 &&
9691          "Exceeds 32-bit integer zero extension limit");
9692   assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9693
9694   // Define a helper function to check a particular ext-scale and lower to it if
9695   // valid.
9696   auto Lower = [&](int Scale) -> SDValue {
9697     SDValue InputV;
9698     bool AnyExt = true;
9699     int Offset = 0;
9700     int Matches = 0;
9701     for (int i = 0; i < NumElements; ++i) {
9702       int M = Mask[i];
9703       if (M < 0)
9704         continue; // Valid anywhere but doesn't tell us anything.
9705       if (i % Scale != 0) {
9706         // Each of the extended elements need to be zeroable.
9707         if (!Zeroable[i])
9708           return SDValue();
9709
9710         // We no longer are in the anyext case.
9711         AnyExt = false;
9712         continue;
9713       }
9714
9715       // Each of the base elements needs to be consecutive indices into the
9716       // same input vector.
9717       SDValue V = M < NumElements ? V1 : V2;
9718       M = M % NumElements;
9719       if (!InputV) {
9720         InputV = V;
9721         Offset = M - (i / Scale);
9722       } else if (InputV != V)
9723         return SDValue(); // Flip-flopping inputs.
9724
9725       // Offset must start in the lowest 128-bit lane or at the start of an
9726       // upper lane.
9727       // FIXME: Is it ever worth allowing a negative base offset?
9728       if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9729             (Offset % NumEltsPerLane) == 0))
9730         return SDValue();
9731
9732       // If we are offsetting, all referenced entries must come from the same
9733       // lane.
9734       if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9735         return SDValue();
9736
9737       if ((M % NumElements) != (Offset + (i / Scale)))
9738         return SDValue(); // Non-consecutive strided elements.
9739       Matches++;
9740     }
9741
9742     // If we fail to find an input, we have a zero-shuffle which should always
9743     // have already been handled.
9744     // FIXME: Maybe handle this here in case during blending we end up with one?
9745     if (!InputV)
9746       return SDValue();
9747
9748     // If we are offsetting, don't extend if we only match a single input, we
9749     // can always do better by using a basic PSHUF or PUNPCK.
9750     if (Offset != 0 && Matches < 2)
9751       return SDValue();
9752
9753     return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9754         DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9755   };
9756
9757   // The widest scale possible for extending is to a 64-bit integer.
9758   assert(Bits % 64 == 0 &&
9759          "The number of bits in a vector must be divisible by 64 on x86!");
9760   int NumExtElements = Bits / 64;
9761
9762   // Each iteration, try extending the elements half as much, but into twice as
9763   // many elements.
9764   for (; NumExtElements < NumElements; NumExtElements *= 2) {
9765     assert(NumElements % NumExtElements == 0 &&
9766            "The input vector size must be divisible by the extended size.");
9767     if (SDValue V = Lower(NumElements / NumExtElements))
9768       return V;
9769   }
9770
9771   // General extends failed, but 128-bit vectors may be able to use MOVQ.
9772   if (Bits != 128)
9773     return SDValue();
9774
9775   // Returns one of the source operands if the shuffle can be reduced to a
9776   // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9777   auto CanZExtLowHalf = [&]() {
9778     for (int i = NumElements / 2; i != NumElements; ++i)
9779       if (!Zeroable[i])
9780         return SDValue();
9781     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9782       return V1;
9783     if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9784       return V2;
9785     return SDValue();
9786   };
9787
9788   if (SDValue V = CanZExtLowHalf()) {
9789     V = DAG.getBitcast(MVT::v2i64, V);
9790     V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9791     return DAG.getBitcast(VT, V);
9792   }
9793
9794   // No viable ext lowering found.
9795   return SDValue();
9796 }
9797
9798 /// \brief Try to get a scalar value for a specific element of a vector.
9799 ///
9800 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9801 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9802                                               SelectionDAG &DAG) {
9803   MVT VT = V.getSimpleValueType();
9804   MVT EltVT = VT.getVectorElementType();
9805   V = peekThroughBitcasts(V);
9806
9807   // If the bitcasts shift the element size, we can't extract an equivalent
9808   // element from it.
9809   MVT NewVT = V.getSimpleValueType();
9810   if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9811     return SDValue();
9812
9813   if (V.getOpcode() == ISD::BUILD_VECTOR ||
9814       (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9815     // Ensure the scalar operand is the same size as the destination.
9816     // FIXME: Add support for scalar truncation where possible.
9817     SDValue S = V.getOperand(Idx);
9818     if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9819       return DAG.getBitcast(EltVT, S);
9820   }
9821
9822   return SDValue();
9823 }
9824
9825 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9826 ///
9827 /// This is particularly important because the set of instructions varies
9828 /// significantly based on whether the operand is a load or not.
9829 static bool isShuffleFoldableLoad(SDValue V) {
9830   V = peekThroughBitcasts(V);
9831   return ISD::isNON_EXTLoad(V.getNode());
9832 }
9833
9834 /// \brief Try to lower insertion of a single element into a zero vector.
9835 ///
9836 /// This is a common pattern that we have especially efficient patterns to lower
9837 /// across all subtarget feature sets.
9838 static SDValue lowerVectorShuffleAsElementInsertion(
9839     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9840     const APInt &Zeroable, const X86Subtarget &Subtarget,
9841     SelectionDAG &DAG) {
9842   MVT ExtVT = VT;
9843   MVT EltVT = VT.getVectorElementType();
9844
9845   int V2Index =
9846       find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9847       Mask.begin();
9848   bool IsV1Zeroable = true;
9849   for (int i = 0, Size = Mask.size(); i < Size; ++i)
9850     if (i != V2Index && !Zeroable[i]) {
9851       IsV1Zeroable = false;
9852       break;
9853     }
9854
9855   // Check for a single input from a SCALAR_TO_VECTOR node.
9856   // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9857   // all the smarts here sunk into that routine. However, the current
9858   // lowering of BUILD_VECTOR makes that nearly impossible until the old
9859   // vector shuffle lowering is dead.
9860   SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9861                                                DAG);
9862   if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9863     // We need to zext the scalar if it is smaller than an i32.
9864     V2S = DAG.getBitcast(EltVT, V2S);
9865     if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9866       // Using zext to expand a narrow element won't work for non-zero
9867       // insertions.
9868       if (!IsV1Zeroable)
9869         return SDValue();
9870
9871       // Zero-extend directly to i32.
9872       ExtVT = MVT::v4i32;
9873       V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9874     }
9875     V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9876   } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9877              EltVT == MVT::i16) {
9878     // Either not inserting from the low element of the input or the input
9879     // element size is too small to use VZEXT_MOVL to clear the high bits.
9880     return SDValue();
9881   }
9882
9883   if (!IsV1Zeroable) {
9884     // If V1 can't be treated as a zero vector we have fewer options to lower
9885     // this. We can't support integer vectors or non-zero targets cheaply, and
9886     // the V1 elements can't be permuted in any way.
9887     assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9888     if (!VT.isFloatingPoint() || V2Index != 0)
9889       return SDValue();
9890     SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9891     V1Mask[V2Index] = -1;
9892     if (!isNoopShuffleMask(V1Mask))
9893       return SDValue();
9894     // This is essentially a special case blend operation, but if we have
9895     // general purpose blend operations, they are always faster. Bail and let
9896     // the rest of the lowering handle these as blends.
9897     if (Subtarget.hasSSE41())
9898       return SDValue();
9899
9900     // Otherwise, use MOVSD or MOVSS.
9901     assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9902            "Only two types of floating point element types to handle!");
9903     return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9904                        ExtVT, V1, V2);
9905   }
9906
9907   // This lowering only works for the low element with floating point vectors.
9908   if (VT.isFloatingPoint() && V2Index != 0)
9909     return SDValue();
9910
9911   V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9912   if (ExtVT != VT)
9913     V2 = DAG.getBitcast(VT, V2);
9914
9915   if (V2Index != 0) {
9916     // If we have 4 or fewer lanes we can cheaply shuffle the element into
9917     // the desired position. Otherwise it is more efficient to do a vector
9918     // shift left. We know that we can do a vector shift left because all
9919     // the inputs are zero.
9920     if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9921       SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9922       V2Shuffle[V2Index] = 0;
9923       V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9924     } else {
9925       V2 = DAG.getBitcast(MVT::v16i8, V2);
9926       V2 = DAG.getNode(
9927           X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9928           DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9929                           DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9930                               DAG.getDataLayout(), VT)));
9931       V2 = DAG.getBitcast(VT, V2);
9932     }
9933   }
9934   return V2;
9935 }
9936
9937 /// Try to lower broadcast of a single - truncated - integer element,
9938 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9939 ///
9940 /// This assumes we have AVX2.
9941 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9942                                                   SDValue V0, int BroadcastIdx,
9943                                                   const X86Subtarget &Subtarget,
9944                                                   SelectionDAG &DAG) {
9945   assert(Subtarget.hasAVX2() &&
9946          "We can only lower integer broadcasts with AVX2!");
9947
9948   EVT EltVT = VT.getVectorElementType();
9949   EVT V0VT = V0.getValueType();
9950
9951   assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9952   assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9953
9954   EVT V0EltVT = V0VT.getVectorElementType();
9955   if (!V0EltVT.isInteger())
9956     return SDValue();
9957
9958   const unsigned EltSize = EltVT.getSizeInBits();
9959   const unsigned V0EltSize = V0EltVT.getSizeInBits();
9960
9961   // This is only a truncation if the original element type is larger.
9962   if (V0EltSize <= EltSize)
9963     return SDValue();
9964
9965   assert(((V0EltSize % EltSize) == 0) &&
9966          "Scalar type sizes must all be powers of 2 on x86!");
9967
9968   const unsigned V0Opc = V0.getOpcode();
9969   const unsigned Scale = V0EltSize / EltSize;
9970   const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9971
9972   if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9973       V0Opc != ISD::BUILD_VECTOR)
9974     return SDValue();
9975
9976   SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9977
9978   // If we're extracting non-least-significant bits, shift so we can truncate.
9979   // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9980   // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9981   // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9982   if (const int OffsetIdx = BroadcastIdx % Scale)
9983     Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9984             DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9985
9986   return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9987                      DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9988 }
9989
9990 /// \brief Try to lower broadcast of a single element.
9991 ///
9992 /// For convenience, this code also bundles all of the subtarget feature set
9993 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9994 /// a convenient way to factor it out.
9995 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9996                                              SDValue V1, SDValue V2,
9997                                              ArrayRef<int> Mask,
9998                                              const X86Subtarget &Subtarget,
9999                                              SelectionDAG &DAG) {
10000   if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10001         (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10002         (Subtarget.hasAVX2() && VT.isInteger())))
10003     return SDValue();
10004
10005   // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10006   // we can only broadcast from a register with AVX2.
10007   unsigned NumElts = Mask.size();
10008   unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
10009   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10010
10011   // Check that the mask is a broadcast.
10012   int BroadcastIdx = -1;
10013   for (int i = 0; i != (int)NumElts; ++i) {
10014     SmallVector<int, 8> BroadcastMask(NumElts, i);
10015     if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10016       BroadcastIdx = i;
10017       break;
10018     }
10019   }
10020
10021   if (BroadcastIdx < 0)
10022     return SDValue();
10023   assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10024                                             "a sorted mask where the broadcast "
10025                                             "comes from V1.");
10026
10027   // Go up the chain of (vector) values to find a scalar load that we can
10028   // combine with the broadcast.
10029   SDValue V = V1;
10030   for (;;) {
10031     switch (V.getOpcode()) {
10032     case ISD::BITCAST: {
10033       SDValue VSrc = V.getOperand(0);
10034       MVT SrcVT = VSrc.getSimpleValueType();
10035       if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
10036         break;
10037       V = VSrc;
10038       continue;
10039     }
10040     case ISD::CONCAT_VECTORS: {
10041       int OperandSize = Mask.size() / V.getNumOperands();
10042       V = V.getOperand(BroadcastIdx / OperandSize);
10043       BroadcastIdx %= OperandSize;
10044       continue;
10045     }
10046     case ISD::INSERT_SUBVECTOR: {
10047       SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10048       auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10049       if (!ConstantIdx)
10050         break;
10051
10052       int BeginIdx = (int)ConstantIdx->getZExtValue();
10053       int EndIdx =
10054           BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10055       if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10056         BroadcastIdx -= BeginIdx;
10057         V = VInner;
10058       } else {
10059         V = VOuter;
10060       }
10061       continue;
10062     }
10063     }
10064     break;
10065   }
10066
10067   // Check if this is a broadcast of a scalar. We special case lowering
10068   // for scalars so that we can more effectively fold with loads.
10069   // First, look through bitcast: if the original value has a larger element
10070   // type than the shuffle, the broadcast element is in essence truncated.
10071   // Make that explicit to ease folding.
10072   if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10073     if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10074             DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10075       return TruncBroadcast;
10076
10077   MVT BroadcastVT = VT;
10078
10079   // Peek through any bitcast (only useful for loads).
10080   SDValue BC = peekThroughBitcasts(V);
10081
10082   // Also check the simpler case, where we can directly reuse the scalar.
10083   if (V.getOpcode() == ISD::BUILD_VECTOR ||
10084       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10085     V = V.getOperand(BroadcastIdx);
10086
10087     // If we can't broadcast from a register, check that the input is a load.
10088     if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10089       return SDValue();
10090   } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10091     // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10092     if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10093       BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10094       Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
10095     }
10096
10097     // If we are broadcasting a load that is only used by the shuffle
10098     // then we can reduce the vector load to the broadcasted scalar load.
10099     LoadSDNode *Ld = cast<LoadSDNode>(BC);
10100     SDValue BaseAddr = Ld->getOperand(1);
10101     EVT SVT = BroadcastVT.getScalarType();
10102     unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10103     SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10104     V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10105                     DAG.getMachineFunction().getMachineMemOperand(
10106                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10107     DAG.makeEquivalentMemoryOrdering(Ld, V);
10108   } else if (!BroadcastFromReg) {
10109     // We can't broadcast from a vector register.
10110     return SDValue();
10111   } else if (BroadcastIdx != 0) {
10112     // We can only broadcast from the zero-element of a vector register,
10113     // but it can be advantageous to broadcast from the zero-element of a
10114     // subvector.
10115     if (!VT.is256BitVector() && !VT.is512BitVector())
10116       return SDValue();
10117
10118     // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10119     if (VT == MVT::v4f64 || VT == MVT::v4i64)
10120       return SDValue();
10121
10122     // Only broadcast the zero-element of a 128-bit subvector.
10123     unsigned EltSize = VT.getScalarSizeInBits();
10124     if (((BroadcastIdx * EltSize) % 128) != 0)
10125       return SDValue();
10126
10127     // The shuffle input might have been a bitcast we looked through; look at
10128     // the original input vector.  Emit an EXTRACT_SUBVECTOR of that type; we'll
10129     // later bitcast it to BroadcastVT.
10130     MVT SrcVT = V.getSimpleValueType();
10131     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10132            "Unexpected vector element size");
10133     assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10134            "Unexpected vector size");
10135
10136     MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
10137     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10138                     DAG.getIntPtrConstant(BroadcastIdx, DL));
10139   }
10140
10141   if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10142     V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10143                     DAG.getBitcast(MVT::f64, V));
10144
10145   // Bitcast back to the same scalar type as BroadcastVT.
10146   MVT SrcVT = V.getSimpleValueType();
10147   if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10148     assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10149            "Unexpected vector element size");
10150     if (SrcVT.isVector()) {
10151       unsigned NumSrcElts = SrcVT.getVectorNumElements();
10152       SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10153     } else {
10154       SrcVT = BroadcastVT.getScalarType();
10155     }
10156     V = DAG.getBitcast(SrcVT, V);
10157   }
10158
10159   // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10160   if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10161     V = DAG.getBitcast(MVT::f64, V);
10162     unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10163     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10164   }
10165
10166   // We only support broadcasting from 128-bit vectors to minimize the
10167   // number of patterns we need to deal with in isel. So extract down to
10168   // 128-bits.
10169   if (SrcVT.getSizeInBits() > 128)
10170     V = extract128BitVector(V, 0, DAG, DL);
10171
10172   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10173 }
10174
10175 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10176 // INSERTPS when the V1 elements are already in the correct locations
10177 // because otherwise we can just always use two SHUFPS instructions which
10178 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10179 // perform INSERTPS if a single V1 element is out of place and all V2
10180 // elements are zeroable.
10181 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10182                                          unsigned &InsertPSMask,
10183                                          const APInt &Zeroable,
10184                                          ArrayRef<int> Mask,
10185                                          SelectionDAG &DAG) {
10186   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10187   assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10188   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10189
10190   // Attempt to match INSERTPS with one element from VA or VB being
10191   // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10192   // are updated.
10193   auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10194                              ArrayRef<int> CandidateMask) {
10195     unsigned ZMask = 0;
10196     int VADstIndex = -1;
10197     int VBDstIndex = -1;
10198     bool VAUsedInPlace = false;
10199
10200     for (int i = 0; i < 4; ++i) {
10201       // Synthesize a zero mask from the zeroable elements (includes undefs).
10202       if (Zeroable[i]) {
10203         ZMask |= 1 << i;
10204         continue;
10205       }
10206
10207       // Flag if we use any VA inputs in place.
10208       if (i == CandidateMask[i]) {
10209         VAUsedInPlace = true;
10210         continue;
10211       }
10212
10213       // We can only insert a single non-zeroable element.
10214       if (VADstIndex >= 0 || VBDstIndex >= 0)
10215         return false;
10216
10217       if (CandidateMask[i] < 4) {
10218         // VA input out of place for insertion.
10219         VADstIndex = i;
10220       } else {
10221         // VB input for insertion.
10222         VBDstIndex = i;
10223       }
10224     }
10225
10226     // Don't bother if we have no (non-zeroable) element for insertion.
10227     if (VADstIndex < 0 && VBDstIndex < 0)
10228       return false;
10229
10230     // Determine element insertion src/dst indices. The src index is from the
10231     // start of the inserted vector, not the start of the concatenated vector.
10232     unsigned VBSrcIndex = 0;
10233     if (VADstIndex >= 0) {
10234       // If we have a VA input out of place, we use VA as the V2 element
10235       // insertion and don't use the original V2 at all.
10236       VBSrcIndex = CandidateMask[VADstIndex];
10237       VBDstIndex = VADstIndex;
10238       VB = VA;
10239     } else {
10240       VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10241     }
10242
10243     // If no V1 inputs are used in place, then the result is created only from
10244     // the zero mask and the V2 insertion - so remove V1 dependency.
10245     if (!VAUsedInPlace)
10246       VA = DAG.getUNDEF(MVT::v4f32);
10247
10248     // Update V1, V2 and InsertPSMask accordingly.
10249     V1 = VA;
10250     V2 = VB;
10251
10252     // Insert the V2 element into the desired position.
10253     InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10254     assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10255     return true;
10256   };
10257
10258   if (matchAsInsertPS(V1, V2, Mask))
10259     return true;
10260
10261   // Commute and try again.
10262   SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10263   ShuffleVectorSDNode::commuteMask(CommutedMask);
10264   if (matchAsInsertPS(V2, V1, CommutedMask))
10265     return true;
10266
10267   return false;
10268 }
10269
10270 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10271                                             SDValue V2, ArrayRef<int> Mask,
10272                                             const APInt &Zeroable,
10273                                             SelectionDAG &DAG) {
10274   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10275   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10276
10277   // Attempt to match the insertps pattern.
10278   unsigned InsertPSMask;
10279   if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10280     return SDValue();
10281
10282   // Insert the V2 element into the desired position.
10283   return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10284                      DAG.getConstant(InsertPSMask, DL, MVT::i8));
10285 }
10286
10287 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10288 /// UNPCK instruction.
10289 ///
10290 /// This specifically targets cases where we end up with alternating between
10291 /// the two inputs, and so can permute them into something that feeds a single
10292 /// UNPCK instruction. Note that this routine only targets integer vectors
10293 /// because for floating point vectors we have a generalized SHUFPS lowering
10294 /// strategy that handles everything that doesn't *exactly* match an unpack,
10295 /// making this clever lowering unnecessary.
10296 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10297                                                     SDValue V1, SDValue V2,
10298                                                     ArrayRef<int> Mask,
10299                                                     SelectionDAG &DAG) {
10300   assert(!VT.isFloatingPoint() &&
10301          "This routine only supports integer vectors.");
10302   assert(VT.is128BitVector() &&
10303          "This routine only works on 128-bit vectors.");
10304   assert(!V2.isUndef() &&
10305          "This routine should only be used when blending two inputs.");
10306   assert(Mask.size() >= 2 && "Single element masks are invalid.");
10307
10308   int Size = Mask.size();
10309
10310   int NumLoInputs =
10311       count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10312   int NumHiInputs =
10313       count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10314
10315   bool UnpackLo = NumLoInputs >= NumHiInputs;
10316
10317   auto TryUnpack = [&](int ScalarSize, int Scale) {
10318     SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10319     SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10320
10321     for (int i = 0; i < Size; ++i) {
10322       if (Mask[i] < 0)
10323         continue;
10324
10325       // Each element of the unpack contains Scale elements from this mask.
10326       int UnpackIdx = i / Scale;
10327
10328       // We only handle the case where V1 feeds the first slots of the unpack.
10329       // We rely on canonicalization to ensure this is the case.
10330       if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10331         return SDValue();
10332
10333       // Setup the mask for this input. The indexing is tricky as we have to
10334       // handle the unpack stride.
10335       SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10336       VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10337           Mask[i] % Size;
10338     }
10339
10340     // If we will have to shuffle both inputs to use the unpack, check whether
10341     // we can just unpack first and shuffle the result. If so, skip this unpack.
10342     if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10343         !isNoopShuffleMask(V2Mask))
10344       return SDValue();
10345
10346     // Shuffle the inputs into place.
10347     V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10348     V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10349
10350     // Cast the inputs to the type we will use to unpack them.
10351     MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10352     V1 = DAG.getBitcast(UnpackVT, V1);
10353     V2 = DAG.getBitcast(UnpackVT, V2);
10354
10355     // Unpack the inputs and cast the result back to the desired type.
10356     return DAG.getBitcast(
10357         VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10358                         UnpackVT, V1, V2));
10359   };
10360
10361   // We try each unpack from the largest to the smallest to try and find one
10362   // that fits this mask.
10363   int OrigScalarSize = VT.getScalarSizeInBits();
10364   for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10365     if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10366       return Unpack;
10367
10368   // If none of the unpack-rooted lowerings worked (or were profitable) try an
10369   // initial unpack.
10370   if (NumLoInputs == 0 || NumHiInputs == 0) {
10371     assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10372            "We have to have *some* inputs!");
10373     int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10374
10375     // FIXME: We could consider the total complexity of the permute of each
10376     // possible unpacking. Or at the least we should consider how many
10377     // half-crossings are created.
10378     // FIXME: We could consider commuting the unpacks.
10379
10380     SmallVector<int, 32> PermMask((unsigned)Size, -1);
10381     for (int i = 0; i < Size; ++i) {
10382       if (Mask[i] < 0)
10383         continue;
10384
10385       assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10386
10387       PermMask[i] =
10388           2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10389     }
10390     return DAG.getVectorShuffle(
10391         VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10392                             DL, VT, V1, V2),
10393         DAG.getUNDEF(VT), PermMask);
10394   }
10395
10396   return SDValue();
10397 }
10398
10399 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10400 ///
10401 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10402 /// support for floating point shuffles but not integer shuffles. These
10403 /// instructions will incur a domain crossing penalty on some chips though so
10404 /// it is better to avoid lowering through this for integer vectors where
10405 /// possible.
10406 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10407                                        const APInt &Zeroable,
10408                                        SDValue V1, SDValue V2,
10409                                        const X86Subtarget &Subtarget,
10410                                        SelectionDAG &DAG) {
10411   assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10412   assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10413   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10414
10415   if (V2.isUndef()) {
10416     // Check for being able to broadcast a single element.
10417     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10418             DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10419       return Broadcast;
10420
10421     // Straight shuffle of a single input vector. Simulate this by using the
10422     // single input as both of the "inputs" to this instruction..
10423     unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10424
10425     if (Subtarget.hasAVX()) {
10426       // If we have AVX, we can use VPERMILPS which will allow folding a load
10427       // into the shuffle.
10428       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10429                          DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10430     }
10431
10432     return DAG.getNode(
10433         X86ISD::SHUFP, DL, MVT::v2f64,
10434         Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10435         Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10436         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10437   }
10438   assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10439   assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10440
10441   // If we have a single input, insert that into V1 if we can do so cheaply.
10442   if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10443     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10444             DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10445       return Insertion;
10446     // Try inverting the insertion since for v2 masks it is easy to do and we
10447     // can't reliably sort the mask one way or the other.
10448     int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10449                           Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10450     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10451             DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10452       return Insertion;
10453   }
10454
10455   // Try to use one of the special instruction patterns to handle two common
10456   // blend patterns if a zero-blend above didn't work.
10457   if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10458       isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10459     if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10460       // We can either use a special instruction to load over the low double or
10461       // to move just the low double.
10462       return DAG.getNode(
10463           isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10464           DL, MVT::v2f64, V2,
10465           DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10466
10467   if (Subtarget.hasSSE41())
10468     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10469                                                   Zeroable, Subtarget, DAG))
10470       return Blend;
10471
10472   // Use dedicated unpack instructions for masks that match their pattern.
10473   if (SDValue V =
10474           lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10475     return V;
10476
10477   unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10478   return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10479                      DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10480 }
10481
10482 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10483 ///
10484 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10485 /// the integer unit to minimize domain crossing penalties. However, for blends
10486 /// it falls back to the floating point shuffle operation with appropriate bit
10487 /// casting.
10488 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10489                                        const APInt &Zeroable,
10490                                        SDValue V1, SDValue V2,
10491                                        const X86Subtarget &Subtarget,
10492                                        SelectionDAG &DAG) {
10493   assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10494   assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10495   assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10496
10497   if (V2.isUndef()) {
10498     // Check for being able to broadcast a single element.
10499     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10500             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10501       return Broadcast;
10502
10503     // Straight shuffle of a single input vector. For everything from SSE2
10504     // onward this has a single fast instruction with no scary immediates.
10505     // We have to map the mask as it is actually a v4i32 shuffle instruction.
10506     V1 = DAG.getBitcast(MVT::v4i32, V1);
10507     int WidenedMask[4] = {
10508         std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10509         std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10510     return DAG.getBitcast(
10511         MVT::v2i64,
10512         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10513                     getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10514   }
10515   assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10516   assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10517   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10518   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10519
10520   // If we have a blend of two same-type PACKUS operations and the blend aligns
10521   // with the low and high halves, we can just merge the PACKUS operations.
10522   // This is particularly important as it lets us merge shuffles that this
10523   // routine itself creates.
10524   auto GetPackNode = [](SDValue V) {
10525     V = peekThroughBitcasts(V);
10526     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10527   };
10528   if (SDValue V1Pack = GetPackNode(V1))
10529     if (SDValue V2Pack = GetPackNode(V2)) {
10530       EVT PackVT = V1Pack.getValueType();
10531       if (PackVT == V2Pack.getValueType())
10532         return DAG.getBitcast(MVT::v2i64,
10533                               DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10534                                           Mask[0] == 0 ? V1Pack.getOperand(0)
10535                                                        : V1Pack.getOperand(1),
10536                                           Mask[1] == 2 ? V2Pack.getOperand(0)
10537                                                        : V2Pack.getOperand(1)));
10538     }
10539
10540   // Try to use shift instructions.
10541   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10542                                                 Zeroable, Subtarget, DAG))
10543     return Shift;
10544
10545   // When loading a scalar and then shuffling it into a vector we can often do
10546   // the insertion cheaply.
10547   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10548           DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10549     return Insertion;
10550   // Try inverting the insertion since for v2 masks it is easy to do and we
10551   // can't reliably sort the mask one way or the other.
10552   int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10553   if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10554           DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10555     return Insertion;
10556
10557   // We have different paths for blend lowering, but they all must use the
10558   // *exact* same predicate.
10559   bool IsBlendSupported = Subtarget.hasSSE41();
10560   if (IsBlendSupported)
10561     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10562                                                   Zeroable, Subtarget, DAG))
10563       return Blend;
10564
10565   // Use dedicated unpack instructions for masks that match their pattern.
10566   if (SDValue V =
10567           lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10568     return V;
10569
10570   // Try to use byte rotation instructions.
10571   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10572   if (Subtarget.hasSSSE3())
10573     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10574             DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10575       return Rotate;
10576
10577   // If we have direct support for blends, we should lower by decomposing into
10578   // a permute. That will be faster than the domain cross.
10579   if (IsBlendSupported)
10580     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10581                                                       Mask, DAG);
10582
10583   // We implement this with SHUFPD which is pretty lame because it will likely
10584   // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10585   // However, all the alternatives are still more cycles and newer chips don't
10586   // have this problem. It would be really nice if x86 had better shuffles here.
10587   V1 = DAG.getBitcast(MVT::v2f64, V1);
10588   V2 = DAG.getBitcast(MVT::v2f64, V2);
10589   return DAG.getBitcast(MVT::v2i64,
10590                         DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10591 }
10592
10593 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10594 ///
10595 /// This is used to disable more specialized lowerings when the shufps lowering
10596 /// will happen to be efficient.
10597 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10598   // This routine only handles 128-bit shufps.
10599   assert(Mask.size() == 4 && "Unsupported mask size!");
10600   assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10601   assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10602   assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10603   assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10604
10605   // To lower with a single SHUFPS we need to have the low half and high half
10606   // each requiring a single input.
10607   if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10608     return false;
10609   if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10610     return false;
10611
10612   return true;
10613 }
10614
10615 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10616 ///
10617 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10618 /// It makes no assumptions about whether this is the *best* lowering, it simply
10619 /// uses it.
10620 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10621                                             ArrayRef<int> Mask, SDValue V1,
10622                                             SDValue V2, SelectionDAG &DAG) {
10623   SDValue LowV = V1, HighV = V2;
10624   int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10625
10626   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10627
10628   if (NumV2Elements == 1) {
10629     int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10630
10631     // Compute the index adjacent to V2Index and in the same half by toggling
10632     // the low bit.
10633     int V2AdjIndex = V2Index ^ 1;
10634
10635     if (Mask[V2AdjIndex] < 0) {
10636       // Handles all the cases where we have a single V2 element and an undef.
10637       // This will only ever happen in the high lanes because we commute the
10638       // vector otherwise.
10639       if (V2Index < 2)
10640         std::swap(LowV, HighV);
10641       NewMask[V2Index] -= 4;
10642     } else {
10643       // Handle the case where the V2 element ends up adjacent to a V1 element.
10644       // To make this work, blend them together as the first step.
10645       int V1Index = V2AdjIndex;
10646       int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10647       V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10648                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10649
10650       // Now proceed to reconstruct the final blend as we have the necessary
10651       // high or low half formed.
10652       if (V2Index < 2) {
10653         LowV = V2;
10654         HighV = V1;
10655       } else {
10656         HighV = V2;
10657       }
10658       NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10659       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10660     }
10661   } else if (NumV2Elements == 2) {
10662     if (Mask[0] < 4 && Mask[1] < 4) {
10663       // Handle the easy case where we have V1 in the low lanes and V2 in the
10664       // high lanes.
10665       NewMask[2] -= 4;
10666       NewMask[3] -= 4;
10667     } else if (Mask[2] < 4 && Mask[3] < 4) {
10668       // We also handle the reversed case because this utility may get called
10669       // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10670       // arrange things in the right direction.
10671       NewMask[0] -= 4;
10672       NewMask[1] -= 4;
10673       HighV = V1;
10674       LowV = V2;
10675     } else {
10676       // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10677       // trying to place elements directly, just blend them and set up the final
10678       // shuffle to place them.
10679
10680       // The first two blend mask elements are for V1, the second two are for
10681       // V2.
10682       int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10683                           Mask[2] < 4 ? Mask[2] : Mask[3],
10684                           (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10685                           (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10686       V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10687                        getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10688
10689       // Now we do a normal shuffle of V1 by giving V1 as both operands to
10690       // a blend.
10691       LowV = HighV = V1;
10692       NewMask[0] = Mask[0] < 4 ? 0 : 2;
10693       NewMask[1] = Mask[0] < 4 ? 2 : 0;
10694       NewMask[2] = Mask[2] < 4 ? 1 : 3;
10695       NewMask[3] = Mask[2] < 4 ? 3 : 1;
10696     }
10697   }
10698   return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10699                      getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10700 }
10701
10702 /// \brief Lower 4-lane 32-bit floating point shuffles.
10703 ///
10704 /// Uses instructions exclusively from the floating point unit to minimize
10705 /// domain crossing penalties, as these are sufficient to implement all v4f32
10706 /// shuffles.
10707 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10708                                        const APInt &Zeroable,
10709                                        SDValue V1, SDValue V2,
10710                                        const X86Subtarget &Subtarget,
10711                                        SelectionDAG &DAG) {
10712   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10713   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10714   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10715
10716   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10717
10718   if (NumV2Elements == 0) {
10719     // Check for being able to broadcast a single element.
10720     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10721             DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10722       return Broadcast;
10723
10724     // Use even/odd duplicate instructions for masks that match their pattern.
10725     if (Subtarget.hasSSE3()) {
10726       if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10727         return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10728       if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10729         return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10730     }
10731
10732     if (Subtarget.hasAVX()) {
10733       // If we have AVX, we can use VPERMILPS which will allow folding a load
10734       // into the shuffle.
10735       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10736                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10737     }
10738
10739     // Otherwise, use a straight shuffle of a single input vector. We pass the
10740     // input vector to both operands to simulate this with a SHUFPS.
10741     return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10742                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10743   }
10744
10745   // There are special ways we can lower some single-element blends. However, we
10746   // have custom ways we can lower more complex single-element blends below that
10747   // we defer to if both this and BLENDPS fail to match, so restrict this to
10748   // when the V2 input is targeting element 0 of the mask -- that is the fast
10749   // case here.
10750   if (NumV2Elements == 1 && Mask[0] >= 4)
10751     if (SDValue V = lowerVectorShuffleAsElementInsertion(
10752             DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10753       return V;
10754
10755   if (Subtarget.hasSSE41()) {
10756     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10757                                                   Zeroable, Subtarget, DAG))
10758       return Blend;
10759
10760     // Use INSERTPS if we can complete the shuffle efficiently.
10761     if (SDValue V =
10762             lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10763       return V;
10764
10765     if (!isSingleSHUFPSMask(Mask))
10766       if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10767               DL, MVT::v4f32, V1, V2, Mask, DAG))
10768         return BlendPerm;
10769   }
10770
10771   // Use low/high mov instructions.
10772   if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10773     return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10774   if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10775     return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10776
10777   // Use dedicated unpack instructions for masks that match their pattern.
10778   if (SDValue V =
10779           lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10780     return V;
10781
10782   // Otherwise fall back to a SHUFPS lowering strategy.
10783   return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10784 }
10785
10786 /// \brief Lower 4-lane i32 vector shuffles.
10787 ///
10788 /// We try to handle these with integer-domain shuffles where we can, but for
10789 /// blends we use the floating point domain blend instructions.
10790 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10791                                        const APInt &Zeroable,
10792                                        SDValue V1, SDValue V2,
10793                                        const X86Subtarget &Subtarget,
10794                                        SelectionDAG &DAG) {
10795   assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10796   assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10797   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10798
10799   // Whenever we can lower this as a zext, that instruction is strictly faster
10800   // than any alternative. It also allows us to fold memory operands into the
10801   // shuffle in many cases.
10802   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10803           DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10804     return ZExt;
10805
10806   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10807
10808   if (NumV2Elements == 0) {
10809     // Check for being able to broadcast a single element.
10810     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10811             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10812       return Broadcast;
10813
10814     // Straight shuffle of a single input vector. For everything from SSE2
10815     // onward this has a single fast instruction with no scary immediates.
10816     // We coerce the shuffle pattern to be compatible with UNPCK instructions
10817     // but we aren't actually going to use the UNPCK instruction because doing
10818     // so prevents folding a load into this instruction or making a copy.
10819     const int UnpackLoMask[] = {0, 0, 1, 1};
10820     const int UnpackHiMask[] = {2, 2, 3, 3};
10821     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10822       Mask = UnpackLoMask;
10823     else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10824       Mask = UnpackHiMask;
10825
10826     return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10827                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10828   }
10829
10830   // Try to use shift instructions.
10831   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10832                                                 Zeroable, Subtarget, DAG))
10833     return Shift;
10834
10835   // There are special ways we can lower some single-element blends.
10836   if (NumV2Elements == 1)
10837     if (SDValue V = lowerVectorShuffleAsElementInsertion(
10838             DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10839       return V;
10840
10841   // We have different paths for blend lowering, but they all must use the
10842   // *exact* same predicate.
10843   bool IsBlendSupported = Subtarget.hasSSE41();
10844   if (IsBlendSupported)
10845     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10846                                                   Zeroable, Subtarget, DAG))
10847       return Blend;
10848
10849   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10850                                                    Zeroable, DAG))
10851     return Masked;
10852
10853   // Use dedicated unpack instructions for masks that match their pattern.
10854   if (SDValue V =
10855           lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10856     return V;
10857
10858   // Try to use byte rotation instructions.
10859   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
10860   if (Subtarget.hasSSSE3())
10861     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10862             DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10863       return Rotate;
10864
10865   // Assume that a single SHUFPS is faster than an alternative sequence of
10866   // multiple instructions (even if the CPU has a domain penalty).
10867   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10868   if (!isSingleSHUFPSMask(Mask)) {
10869     // If we have direct support for blends, we should lower by decomposing into
10870     // a permute. That will be faster than the domain cross.
10871     if (IsBlendSupported)
10872       return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10873                                                         Mask, DAG);
10874
10875     // Try to lower by permuting the inputs into an unpack instruction.
10876     if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10877             DL, MVT::v4i32, V1, V2, Mask, DAG))
10878       return Unpack;
10879   }
10880
10881   // We implement this with SHUFPS because it can blend from two vectors.
10882   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10883   // up the inputs, bypassing domain shift penalties that we would incur if we
10884   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10885   // relevant.
10886   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10887   SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10888   SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10889   return DAG.getBitcast(MVT::v4i32, ShufPS);
10890 }
10891
10892 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10893 /// shuffle lowering, and the most complex part.
10894 ///
10895 /// The lowering strategy is to try to form pairs of input lanes which are
10896 /// targeted at the same half of the final vector, and then use a dword shuffle
10897 /// to place them onto the right half, and finally unpack the paired lanes into
10898 /// their final position.
10899 ///
10900 /// The exact breakdown of how to form these dword pairs and align them on the
10901 /// correct sides is really tricky. See the comments within the function for
10902 /// more of the details.
10903 ///
10904 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10905 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10906 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10907 /// vector, form the analogous 128-bit 8-element Mask.
10908 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10909     const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10910     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10911   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10912   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10913
10914   assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10915   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10916   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10917
10918   SmallVector<int, 4> LoInputs;
10919   copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10920   std::sort(LoInputs.begin(), LoInputs.end());
10921   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10922   SmallVector<int, 4> HiInputs;
10923   copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10924   std::sort(HiInputs.begin(), HiInputs.end());
10925   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10926   int NumLToL =
10927       std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10928   int NumHToL = LoInputs.size() - NumLToL;
10929   int NumLToH =
10930       std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10931   int NumHToH = HiInputs.size() - NumLToH;
10932   MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10933   MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10934   MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10935   MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10936
10937   // If we are splatting two values from one half - one to each half, then
10938   // we can shuffle that half so each is splatted to a dword, then splat those
10939   // to their respective halves.
10940   auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10941                         int DOffset) {
10942     int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10943     int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10944     V = DAG.getNode(ShufWOp, DL, VT, V,
10945                     getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10946     V = DAG.getBitcast(PSHUFDVT, V);
10947     V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10948                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10949     return DAG.getBitcast(VT, V);
10950   };
10951
10952   if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10953     return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10954   if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10955     return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10956
10957   // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10958   // such inputs we can swap two of the dwords across the half mark and end up
10959   // with <=2 inputs to each half in each half. Once there, we can fall through
10960   // to the generic code below. For example:
10961   //
10962   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10963   // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10964   //
10965   // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10966   // and an existing 2-into-2 on the other half. In this case we may have to
10967   // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10968   // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10969   // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10970   // because any other situation (including a 3-into-1 or 1-into-3 in the other
10971   // half than the one we target for fixing) will be fixed when we re-enter this
10972   // path. We will also combine away any sequence of PSHUFD instructions that
10973   // result into a single instruction. Here is an example of the tricky case:
10974   //
10975   // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10976   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10977   //
10978   // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10979   //
10980   // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10981   // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10982   //
10983   // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10984   // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10985   //
10986   // The result is fine to be handled by the generic logic.
10987   auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10988                           ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10989                           int AOffset, int BOffset) {
10990     assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10991            "Must call this with A having 3 or 1 inputs from the A half.");
10992     assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10993            "Must call this with B having 1 or 3 inputs from the B half.");
10994     assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10995            "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10996
10997     bool ThreeAInputs = AToAInputs.size() == 3;
10998
10999     // Compute the index of dword with only one word among the three inputs in
11000     // a half by taking the sum of the half with three inputs and subtracting
11001     // the sum of the actual three inputs. The difference is the remaining
11002     // slot.
11003     int ADWord, BDWord;
11004     int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11005     int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11006     int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11007     ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11008     int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11009     int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11010     int TripleNonInputIdx =
11011         TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11012     TripleDWord = TripleNonInputIdx / 2;
11013
11014     // We use xor with one to compute the adjacent DWord to whichever one the
11015     // OneInput is in.
11016     OneInputDWord = (OneInput / 2) ^ 1;
11017
11018     // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11019     // and BToA inputs. If there is also such a problem with the BToB and AToB
11020     // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11021     // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11022     // is essential that we don't *create* a 3<-1 as then we might oscillate.
11023     if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11024       // Compute how many inputs will be flipped by swapping these DWords. We
11025       // need
11026       // to balance this to ensure we don't form a 3-1 shuffle in the other
11027       // half.
11028       int NumFlippedAToBInputs =
11029           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11030           std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11031       int NumFlippedBToBInputs =
11032           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11033           std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11034       if ((NumFlippedAToBInputs == 1 &&
11035            (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11036           (NumFlippedBToBInputs == 1 &&
11037            (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11038         // We choose whether to fix the A half or B half based on whether that
11039         // half has zero flipped inputs. At zero, we may not be able to fix it
11040         // with that half. We also bias towards fixing the B half because that
11041         // will more commonly be the high half, and we have to bias one way.
11042         auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11043                                                        ArrayRef<int> Inputs) {
11044           int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11045           bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11046           // Determine whether the free index is in the flipped dword or the
11047           // unflipped dword based on where the pinned index is. We use this bit
11048           // in an xor to conditionally select the adjacent dword.
11049           int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11050           bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11051           if (IsFixIdxInput == IsFixFreeIdxInput)
11052             FixFreeIdx += 1;
11053           IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11054           assert(IsFixIdxInput != IsFixFreeIdxInput &&
11055                  "We need to be changing the number of flipped inputs!");
11056           int PSHUFHalfMask[] = {0, 1, 2, 3};
11057           std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11058           V = DAG.getNode(
11059               FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11060               MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11061               getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11062
11063           for (int &M : Mask)
11064             if (M >= 0 && M == FixIdx)
11065               M = FixFreeIdx;
11066             else if (M >= 0 && M == FixFreeIdx)
11067               M = FixIdx;
11068         };
11069         if (NumFlippedBToBInputs != 0) {
11070           int BPinnedIdx =
11071               BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11072           FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11073         } else {
11074           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11075           int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11076           FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11077         }
11078       }
11079     }
11080
11081     int PSHUFDMask[] = {0, 1, 2, 3};
11082     PSHUFDMask[ADWord] = BDWord;
11083     PSHUFDMask[BDWord] = ADWord;
11084     V = DAG.getBitcast(
11085         VT,
11086         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11087                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11088
11089     // Adjust the mask to match the new locations of A and B.
11090     for (int &M : Mask)
11091       if (M >= 0 && M/2 == ADWord)
11092         M = 2 * BDWord + M % 2;
11093       else if (M >= 0 && M/2 == BDWord)
11094         M = 2 * ADWord + M % 2;
11095
11096     // Recurse back into this routine to re-compute state now that this isn't
11097     // a 3 and 1 problem.
11098     return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11099                                                      DAG);
11100   };
11101   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11102     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11103   if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11104     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11105
11106   // At this point there are at most two inputs to the low and high halves from
11107   // each half. That means the inputs can always be grouped into dwords and
11108   // those dwords can then be moved to the correct half with a dword shuffle.
11109   // We use at most one low and one high word shuffle to collect these paired
11110   // inputs into dwords, and finally a dword shuffle to place them.
11111   int PSHUFLMask[4] = {-1, -1, -1, -1};
11112   int PSHUFHMask[4] = {-1, -1, -1, -1};
11113   int PSHUFDMask[4] = {-1, -1, -1, -1};
11114
11115   // First fix the masks for all the inputs that are staying in their
11116   // original halves. This will then dictate the targets of the cross-half
11117   // shuffles.
11118   auto fixInPlaceInputs =
11119       [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11120                     MutableArrayRef<int> SourceHalfMask,
11121                     MutableArrayRef<int> HalfMask, int HalfOffset) {
11122     if (InPlaceInputs.empty())
11123       return;
11124     if (InPlaceInputs.size() == 1) {
11125       SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11126           InPlaceInputs[0] - HalfOffset;
11127       PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11128       return;
11129     }
11130     if (IncomingInputs.empty()) {
11131       // Just fix all of the in place inputs.
11132       for (int Input : InPlaceInputs) {
11133         SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11134         PSHUFDMask[Input / 2] = Input / 2;
11135       }
11136       return;
11137     }
11138
11139     assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11140     SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11141         InPlaceInputs[0] - HalfOffset;
11142     // Put the second input next to the first so that they are packed into
11143     // a dword. We find the adjacent index by toggling the low bit.
11144     int AdjIndex = InPlaceInputs[0] ^ 1;
11145     SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11146     std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11147     PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11148   };
11149   fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11150   fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11151
11152   // Now gather the cross-half inputs and place them into a free dword of
11153   // their target half.
11154   // FIXME: This operation could almost certainly be simplified dramatically to
11155   // look more like the 3-1 fixing operation.
11156   auto moveInputsToRightHalf = [&PSHUFDMask](
11157       MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11158       MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11159       MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11160       int DestOffset) {
11161     auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11162       return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11163     };
11164     auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11165                                                int Word) {
11166       int LowWord = Word & ~1;
11167       int HighWord = Word | 1;
11168       return isWordClobbered(SourceHalfMask, LowWord) ||
11169              isWordClobbered(SourceHalfMask, HighWord);
11170     };
11171
11172     if (IncomingInputs.empty())
11173       return;
11174
11175     if (ExistingInputs.empty()) {
11176       // Map any dwords with inputs from them into the right half.
11177       for (int Input : IncomingInputs) {
11178         // If the source half mask maps over the inputs, turn those into
11179         // swaps and use the swapped lane.
11180         if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11181           if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11182             SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11183                 Input - SourceOffset;
11184             // We have to swap the uses in our half mask in one sweep.
11185             for (int &M : HalfMask)
11186               if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11187                 M = Input;
11188               else if (M == Input)
11189                 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11190           } else {
11191             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11192                        Input - SourceOffset &&
11193                    "Previous placement doesn't match!");
11194           }
11195           // Note that this correctly re-maps both when we do a swap and when
11196           // we observe the other side of the swap above. We rely on that to
11197           // avoid swapping the members of the input list directly.
11198           Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11199         }
11200
11201         // Map the input's dword into the correct half.
11202         if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11203           PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11204         else
11205           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11206                      Input / 2 &&
11207                  "Previous placement doesn't match!");
11208       }
11209
11210       // And just directly shift any other-half mask elements to be same-half
11211       // as we will have mirrored the dword containing the element into the
11212       // same position within that half.
11213       for (int &M : HalfMask)
11214         if (M >= SourceOffset && M < SourceOffset + 4) {
11215           M = M - SourceOffset + DestOffset;
11216           assert(M >= 0 && "This should never wrap below zero!");
11217         }
11218       return;
11219     }
11220
11221     // Ensure we have the input in a viable dword of its current half. This
11222     // is particularly tricky because the original position may be clobbered
11223     // by inputs being moved and *staying* in that half.
11224     if (IncomingInputs.size() == 1) {
11225       if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11226         int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11227                          SourceOffset;
11228         SourceHalfMask[InputFixed - SourceOffset] =
11229             IncomingInputs[0] - SourceOffset;
11230         std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11231                      InputFixed);
11232         IncomingInputs[0] = InputFixed;
11233       }
11234     } else if (IncomingInputs.size() == 2) {
11235       if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11236           isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11237         // We have two non-adjacent or clobbered inputs we need to extract from
11238         // the source half. To do this, we need to map them into some adjacent
11239         // dword slot in the source mask.
11240         int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11241                               IncomingInputs[1] - SourceOffset};
11242
11243         // If there is a free slot in the source half mask adjacent to one of
11244         // the inputs, place the other input in it. We use (Index XOR 1) to
11245         // compute an adjacent index.
11246         if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11247             SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11248           SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11249           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11250           InputsFixed[1] = InputsFixed[0] ^ 1;
11251         } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11252                    SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11253           SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11254           SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11255           InputsFixed[0] = InputsFixed[1] ^ 1;
11256         } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11257                    SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11258           // The two inputs are in the same DWord but it is clobbered and the
11259           // adjacent DWord isn't used at all. Move both inputs to the free
11260           // slot.
11261           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11262           SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11263           InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11264           InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11265         } else {
11266           // The only way we hit this point is if there is no clobbering
11267           // (because there are no off-half inputs to this half) and there is no
11268           // free slot adjacent to one of the inputs. In this case, we have to
11269           // swap an input with a non-input.
11270           for (int i = 0; i < 4; ++i)
11271             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11272                    "We can't handle any clobbers here!");
11273           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11274                  "Cannot have adjacent inputs here!");
11275
11276           SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11277           SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11278
11279           // We also have to update the final source mask in this case because
11280           // it may need to undo the above swap.
11281           for (int &M : FinalSourceHalfMask)
11282             if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11283               M = InputsFixed[1] + SourceOffset;
11284             else if (M == InputsFixed[1] + SourceOffset)
11285               M = (InputsFixed[0] ^ 1) + SourceOffset;
11286
11287           InputsFixed[1] = InputsFixed[0] ^ 1;
11288         }
11289
11290         // Point everything at the fixed inputs.
11291         for (int &M : HalfMask)
11292           if (M == IncomingInputs[0])
11293             M = InputsFixed[0] + SourceOffset;
11294           else if (M == IncomingInputs[1])
11295             M = InputsFixed[1] + SourceOffset;
11296
11297         IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11298         IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11299       }
11300     } else {
11301       llvm_unreachable("Unhandled input size!");
11302     }
11303
11304     // Now hoist the DWord down to the right half.
11305     int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11306     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11307     PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11308     for (int &M : HalfMask)
11309       for (int Input : IncomingInputs)
11310         if (M == Input)
11311           M = FreeDWord * 2 + Input % 2;
11312   };
11313   moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11314                         /*SourceOffset*/ 4, /*DestOffset*/ 0);
11315   moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11316                         /*SourceOffset*/ 0, /*DestOffset*/ 4);
11317
11318   // Now enact all the shuffles we've computed to move the inputs into their
11319   // target half.
11320   if (!isNoopShuffleMask(PSHUFLMask))
11321     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11322                     getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11323   if (!isNoopShuffleMask(PSHUFHMask))
11324     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11325                     getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11326   if (!isNoopShuffleMask(PSHUFDMask))
11327     V = DAG.getBitcast(
11328         VT,
11329         DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11330                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11331
11332   // At this point, each half should contain all its inputs, and we can then
11333   // just shuffle them into their final position.
11334   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11335          "Failed to lift all the high half inputs to the low mask!");
11336   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11337          "Failed to lift all the low half inputs to the high mask!");
11338
11339   // Do a half shuffle for the low mask.
11340   if (!isNoopShuffleMask(LoMask))
11341     V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11342                     getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11343
11344   // Do a half shuffle with the high mask after shifting its values down.
11345   for (int &M : HiMask)
11346     if (M >= 0)
11347       M -= 4;
11348   if (!isNoopShuffleMask(HiMask))
11349     V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11350                     getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11351
11352   return V;
11353 }
11354
11355 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11356 /// blend if only one input is used.
11357 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11358     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11359     const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11360     bool &V2InUse) {
11361   SDValue V1Mask[16];
11362   SDValue V2Mask[16];
11363   V1InUse = false;
11364   V2InUse = false;
11365
11366   int Size = Mask.size();
11367   int Scale = 16 / Size;
11368   for (int i = 0; i < 16; ++i) {
11369     if (Mask[i / Scale] < 0) {
11370       V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11371     } else {
11372       const int ZeroMask = 0x80;
11373       int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11374                                           : ZeroMask;
11375       int V2Idx = Mask[i / Scale] < Size
11376                       ? ZeroMask
11377                       : (Mask[i / Scale] - Size) * Scale + i % Scale;
11378       if (Zeroable[i / Scale])
11379         V1Idx = V2Idx = ZeroMask;
11380       V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11381       V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11382       V1InUse |= (ZeroMask != V1Idx);
11383       V2InUse |= (ZeroMask != V2Idx);
11384     }
11385   }
11386
11387   if (V1InUse)
11388     V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11389                      DAG.getBitcast(MVT::v16i8, V1),
11390                      DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11391   if (V2InUse)
11392     V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11393                      DAG.getBitcast(MVT::v16i8, V2),
11394                      DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11395
11396   // If we need shuffled inputs from both, blend the two.
11397   SDValue V;
11398   if (V1InUse && V2InUse)
11399     V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11400   else
11401     V = V1InUse ? V1 : V2;
11402
11403   // Cast the result back to the correct type.
11404   return DAG.getBitcast(VT, V);
11405 }
11406
11407 /// \brief Generic lowering of 8-lane i16 shuffles.
11408 ///
11409 /// This handles both single-input shuffles and combined shuffle/blends with
11410 /// two inputs. The single input shuffles are immediately delegated to
11411 /// a dedicated lowering routine.
11412 ///
11413 /// The blends are lowered in one of three fundamental ways. If there are few
11414 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11415 /// of the input is significantly cheaper when lowered as an interleaving of
11416 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11417 /// halves of the inputs separately (making them have relatively few inputs)
11418 /// and then concatenate them.
11419 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11420                                        const APInt &Zeroable,
11421                                        SDValue V1, SDValue V2,
11422                                        const X86Subtarget &Subtarget,
11423                                        SelectionDAG &DAG) {
11424   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11425   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11426   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11427
11428   // Whenever we can lower this as a zext, that instruction is strictly faster
11429   // than any alternative.
11430   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11431           DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11432     return ZExt;
11433
11434   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11435
11436   if (NumV2Inputs == 0) {
11437     // Check for being able to broadcast a single element.
11438     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11439             DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11440       return Broadcast;
11441
11442     // Try to use shift instructions.
11443     if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11444                                                   Zeroable, Subtarget, DAG))
11445       return Shift;
11446
11447     // Use dedicated unpack instructions for masks that match their pattern.
11448     if (SDValue V =
11449             lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11450       return V;
11451
11452     // Try to use byte rotation instructions.
11453     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11454                                                         Mask, Subtarget, DAG))
11455       return Rotate;
11456
11457     // Make a copy of the mask so it can be modified.
11458     SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11459     return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11460                                                      MutableMask, Subtarget,
11461                                                      DAG);
11462   }
11463
11464   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11465          "All single-input shuffles should be canonicalized to be V1-input "
11466          "shuffles.");
11467
11468   // Try to use shift instructions.
11469   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11470                                                 Zeroable, Subtarget, DAG))
11471     return Shift;
11472
11473   // See if we can use SSE4A Extraction / Insertion.
11474   if (Subtarget.hasSSE4A())
11475     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11476                                                 Zeroable, DAG))
11477       return V;
11478
11479   // There are special ways we can lower some single-element blends.
11480   if (NumV2Inputs == 1)
11481     if (SDValue V = lowerVectorShuffleAsElementInsertion(
11482             DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11483       return V;
11484
11485   // We have different paths for blend lowering, but they all must use the
11486   // *exact* same predicate.
11487   bool IsBlendSupported = Subtarget.hasSSE41();
11488   if (IsBlendSupported)
11489     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11490                                                   Zeroable, Subtarget, DAG))
11491       return Blend;
11492
11493   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11494                                                    Zeroable, DAG))
11495     return Masked;
11496
11497   // Use dedicated unpack instructions for masks that match their pattern.
11498   if (SDValue V =
11499           lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11500     return V;
11501
11502   // Try to use byte rotation instructions.
11503   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11504           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11505     return Rotate;
11506
11507   if (SDValue BitBlend =
11508           lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11509     return BitBlend;
11510
11511   // Try to lower by permuting the inputs into an unpack instruction.
11512   if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11513                                                             V2, Mask, DAG))
11514     return Unpack;
11515
11516   // If we can't directly blend but can use PSHUFB, that will be better as it
11517   // can both shuffle and set up the inefficient blend.
11518   if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11519     bool V1InUse, V2InUse;
11520     return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11521                                               Zeroable, DAG, V1InUse, V2InUse);
11522   }
11523
11524   // We can always bit-blend if we have to so the fallback strategy is to
11525   // decompose into single-input permutes and blends.
11526   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11527                                                     Mask, DAG);
11528 }
11529
11530 /// \brief Check whether a compaction lowering can be done by dropping even
11531 /// elements and compute how many times even elements must be dropped.
11532 ///
11533 /// This handles shuffles which take every Nth element where N is a power of
11534 /// two. Example shuffle masks:
11535 ///
11536 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
11537 ///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11538 ///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
11539 ///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
11540 ///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
11541 ///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
11542 ///
11543 /// Any of these lanes can of course be undef.
11544 ///
11545 /// This routine only supports N <= 3.
11546 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11547 /// for larger N.
11548 ///
11549 /// \returns N above, or the number of times even elements must be dropped if
11550 /// there is such a number. Otherwise returns zero.
11551 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11552                                           bool IsSingleInput) {
11553   // The modulus for the shuffle vector entries is based on whether this is
11554   // a single input or not.
11555   int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11556   assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11557          "We should only be called with masks with a power-of-2 size!");
11558
11559   uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11560
11561   // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11562   // and 2^3 simultaneously. This is because we may have ambiguity with
11563   // partially undef inputs.
11564   bool ViableForN[3] = {true, true, true};
11565
11566   for (int i = 0, e = Mask.size(); i < e; ++i) {
11567     // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11568     // want.
11569     if (Mask[i] < 0)
11570       continue;
11571
11572     bool IsAnyViable = false;
11573     for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11574       if (ViableForN[j]) {
11575         uint64_t N = j + 1;
11576
11577         // The shuffle mask must be equal to (i * 2^N) % M.
11578         if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11579           IsAnyViable = true;
11580         else
11581           ViableForN[j] = false;
11582       }
11583     // Early exit if we exhaust the possible powers of two.
11584     if (!IsAnyViable)
11585       break;
11586   }
11587
11588   for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11589     if (ViableForN[j])
11590       return j + 1;
11591
11592   // Return 0 as there is no viable power of two.
11593   return 0;
11594 }
11595
11596 /// \brief Generic lowering of v16i8 shuffles.
11597 ///
11598 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11599 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11600 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11601 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11602 /// back together.
11603 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11604                                        const APInt &Zeroable,
11605                                        SDValue V1, SDValue V2,
11606                                        const X86Subtarget &Subtarget,
11607                                        SelectionDAG &DAG) {
11608   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11609   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11610   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11611
11612   // Try to use shift instructions.
11613   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11614                                                 Zeroable, Subtarget, DAG))
11615     return Shift;
11616
11617   // Try to use byte rotation instructions.
11618   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11619           DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11620     return Rotate;
11621
11622   // Try to use a zext lowering.
11623   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11624           DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11625     return ZExt;
11626
11627   // See if we can use SSE4A Extraction / Insertion.
11628   if (Subtarget.hasSSE4A())
11629     if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11630                                                 Zeroable, DAG))
11631       return V;
11632
11633   int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11634
11635   // For single-input shuffles, there are some nicer lowering tricks we can use.
11636   if (NumV2Elements == 0) {
11637     // Check for being able to broadcast a single element.
11638     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11639             DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11640       return Broadcast;
11641
11642     // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11643     // Notably, this handles splat and partial-splat shuffles more efficiently.
11644     // However, it only makes sense if the pre-duplication shuffle simplifies
11645     // things significantly. Currently, this means we need to be able to
11646     // express the pre-duplication shuffle as an i16 shuffle.
11647     //
11648     // FIXME: We should check for other patterns which can be widened into an
11649     // i16 shuffle as well.
11650     auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11651       for (int i = 0; i < 16; i += 2)
11652         if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11653           return false;
11654
11655       return true;
11656     };
11657     auto tryToWidenViaDuplication = [&]() -> SDValue {
11658       if (!canWidenViaDuplication(Mask))
11659         return SDValue();
11660       SmallVector<int, 4> LoInputs;
11661       copy_if(Mask, std::back_inserter(LoInputs),
11662               [](int M) { return M >= 0 && M < 8; });
11663       std::sort(LoInputs.begin(), LoInputs.end());
11664       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11665                      LoInputs.end());
11666       SmallVector<int, 4> HiInputs;
11667       copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11668       std::sort(HiInputs.begin(), HiInputs.end());
11669       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11670                      HiInputs.end());
11671
11672       bool TargetLo = LoInputs.size() >= HiInputs.size();
11673       ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11674       ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11675
11676       int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11677       SmallDenseMap<int, int, 8> LaneMap;
11678       for (int I : InPlaceInputs) {
11679         PreDupI16Shuffle[I/2] = I/2;
11680         LaneMap[I] = I;
11681       }
11682       int j = TargetLo ? 0 : 4, je = j + 4;
11683       for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11684         // Check if j is already a shuffle of this input. This happens when
11685         // there are two adjacent bytes after we move the low one.
11686         if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11687           // If we haven't yet mapped the input, search for a slot into which
11688           // we can map it.
11689           while (j < je && PreDupI16Shuffle[j] >= 0)
11690             ++j;
11691
11692           if (j == je)
11693             // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11694             return SDValue();
11695
11696           // Map this input with the i16 shuffle.
11697           PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11698         }
11699
11700         // Update the lane map based on the mapping we ended up with.
11701         LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11702       }
11703       V1 = DAG.getBitcast(
11704           MVT::v16i8,
11705           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11706                                DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11707
11708       // Unpack the bytes to form the i16s that will be shuffled into place.
11709       V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11710                        MVT::v16i8, V1, V1);
11711
11712       int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11713       for (int i = 0; i < 16; ++i)
11714         if (Mask[i] >= 0) {
11715           int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11716           assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11717           if (PostDupI16Shuffle[i / 2] < 0)
11718             PostDupI16Shuffle[i / 2] = MappedMask;
11719           else
11720             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11721                    "Conflicting entries in the original shuffle!");
11722         }
11723       return DAG.getBitcast(
11724           MVT::v16i8,
11725           DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11726                                DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11727     };
11728     if (SDValue V = tryToWidenViaDuplication())
11729       return V;
11730   }
11731
11732   if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11733                                                    Zeroable, DAG))
11734     return Masked;
11735
11736   // Use dedicated unpack instructions for masks that match their pattern.
11737   if (SDValue V =
11738           lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11739     return V;
11740
11741   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11742   // with PSHUFB. It is important to do this before we attempt to generate any
11743   // blends but after all of the single-input lowerings. If the single input
11744   // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11745   // want to preserve that and we can DAG combine any longer sequences into
11746   // a PSHUFB in the end. But once we start blending from multiple inputs,
11747   // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11748   // and there are *very* few patterns that would actually be faster than the
11749   // PSHUFB approach because of its ability to zero lanes.
11750   //
11751   // FIXME: The only exceptions to the above are blends which are exact
11752   // interleavings with direct instructions supporting them. We currently don't
11753   // handle those well here.
11754   if (Subtarget.hasSSSE3()) {
11755     bool V1InUse = false;
11756     bool V2InUse = false;
11757
11758     SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11759         DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11760
11761     // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11762     // do so. This avoids using them to handle blends-with-zero which is
11763     // important as a single pshufb is significantly faster for that.
11764     if (V1InUse && V2InUse) {
11765       if (Subtarget.hasSSE41())
11766         if (SDValue Blend = lowerVectorShuffleAsBlend(
11767                 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11768           return Blend;
11769
11770       // We can use an unpack to do the blending rather than an or in some
11771       // cases. Even though the or may be (very minorly) more efficient, we
11772       // prefer this lowering because there are common cases where part of
11773       // the complexity of the shuffles goes away when we do the final blend as
11774       // an unpack.
11775       // FIXME: It might be worth trying to detect if the unpack-feeding
11776       // shuffles will both be pshufb, in which case we shouldn't bother with
11777       // this.
11778       if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11779               DL, MVT::v16i8, V1, V2, Mask, DAG))
11780         return Unpack;
11781     }
11782
11783     return PSHUFB;
11784   }
11785
11786   // There are special ways we can lower some single-element blends.
11787   if (NumV2Elements == 1)
11788     if (SDValue V = lowerVectorShuffleAsElementInsertion(
11789             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11790       return V;
11791
11792   if (SDValue BitBlend =
11793           lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11794     return BitBlend;
11795
11796   // Check whether a compaction lowering can be done. This handles shuffles
11797   // which take every Nth element for some even N. See the helper function for
11798   // details.
11799   //
11800   // We special case these as they can be particularly efficiently handled with
11801   // the PACKUSB instruction on x86 and they show up in common patterns of
11802   // rearranging bytes to truncate wide elements.
11803   bool IsSingleInput = V2.isUndef();
11804   if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11805     // NumEvenDrops is the power of two stride of the elements. Another way of
11806     // thinking about it is that we need to drop the even elements this many
11807     // times to get the original input.
11808
11809     // First we need to zero all the dropped bytes.
11810     assert(NumEvenDrops <= 3 &&
11811            "No support for dropping even elements more than 3 times.");
11812     // We use the mask type to pick which bytes are preserved based on how many
11813     // elements are dropped.
11814     MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11815     SDValue ByteClearMask = DAG.getBitcast(
11816         MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
11817     V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11818     if (!IsSingleInput)
11819       V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11820
11821     // Now pack things back together.
11822     V1 = DAG.getBitcast(MVT::v8i16, V1);
11823     V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11824     SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11825     for (int i = 1; i < NumEvenDrops; ++i) {
11826       Result = DAG.getBitcast(MVT::v8i16, Result);
11827       Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11828     }
11829
11830     return Result;
11831   }
11832
11833   // Handle multi-input cases by blending single-input shuffles.
11834   if (NumV2Elements > 0)
11835     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11836                                                       Mask, DAG);
11837
11838   // The fallback path for single-input shuffles widens this into two v8i16
11839   // vectors with unpacks, shuffles those, and then pulls them back together
11840   // with a pack.
11841   SDValue V = V1;
11842
11843   std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11844   std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11845   for (int i = 0; i < 16; ++i)
11846     if (Mask[i] >= 0)
11847       (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11848
11849   SDValue VLoHalf, VHiHalf;
11850   // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11851   // them out and avoid using UNPCK{L,H} to extract the elements of V as
11852   // i16s.
11853   if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11854       none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11855     // Use a mask to drop the high bytes.
11856     VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11857     VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11858                           DAG.getConstant(0x00FF, DL, MVT::v8i16));
11859
11860     // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11861     VHiHalf = DAG.getUNDEF(MVT::v8i16);
11862
11863     // Squash the masks to point directly into VLoHalf.
11864     for (int &M : LoBlendMask)
11865       if (M >= 0)
11866         M /= 2;
11867     for (int &M : HiBlendMask)
11868       if (M >= 0)
11869         M /= 2;
11870   } else {
11871     // Otherwise just unpack the low half of V into VLoHalf and the high half into
11872     // VHiHalf so that we can blend them as i16s.
11873     SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11874
11875     VLoHalf = DAG.getBitcast(
11876         MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11877     VHiHalf = DAG.getBitcast(
11878         MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11879   }
11880
11881   SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11882   SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11883
11884   return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11885 }
11886
11887 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11888 ///
11889 /// This routine breaks down the specific type of 128-bit shuffle and
11890 /// dispatches to the lowering routines accordingly.
11891 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11892                                         MVT VT, SDValue V1, SDValue V2,
11893                                         const APInt &Zeroable,
11894                                         const X86Subtarget &Subtarget,
11895                                         SelectionDAG &DAG) {
11896   switch (VT.SimpleTy) {
11897   case MVT::v2i64:
11898     return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11899   case MVT::v2f64:
11900     return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11901   case MVT::v4i32:
11902     return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11903   case MVT::v4f32:
11904     return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11905   case MVT::v8i16:
11906     return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11907   case MVT::v16i8:
11908     return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11909
11910   default:
11911     llvm_unreachable("Unimplemented!");
11912   }
11913 }
11914
11915 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11916 ///
11917 /// This routine just extracts two subvectors, shuffles them independently, and
11918 /// then concatenates them back together. This should work effectively with all
11919 /// AVX vector shuffle types.
11920 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11921                                           SDValue V2, ArrayRef<int> Mask,
11922                                           SelectionDAG &DAG) {
11923   assert(VT.getSizeInBits() >= 256 &&
11924          "Only for 256-bit or wider vector shuffles!");
11925   assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11926   assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11927
11928   ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11929   ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11930
11931   int NumElements = VT.getVectorNumElements();
11932   int SplitNumElements = NumElements / 2;
11933   MVT ScalarVT = VT.getVectorElementType();
11934   MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11935
11936   // Rather than splitting build-vectors, just build two narrower build
11937   // vectors. This helps shuffling with splats and zeros.
11938   auto SplitVector = [&](SDValue V) {
11939     V = peekThroughBitcasts(V);
11940
11941     MVT OrigVT = V.getSimpleValueType();
11942     int OrigNumElements = OrigVT.getVectorNumElements();
11943     int OrigSplitNumElements = OrigNumElements / 2;
11944     MVT OrigScalarVT = OrigVT.getVectorElementType();
11945     MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11946
11947     SDValue LoV, HiV;
11948
11949     auto *BV = dyn_cast<BuildVectorSDNode>(V);
11950     if (!BV) {
11951       LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11952                         DAG.getIntPtrConstant(0, DL));
11953       HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11954                         DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11955     } else {
11956
11957       SmallVector<SDValue, 16> LoOps, HiOps;
11958       for (int i = 0; i < OrigSplitNumElements; ++i) {
11959         LoOps.push_back(BV->getOperand(i));
11960         HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11961       }
11962       LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11963       HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11964     }
11965     return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11966                           DAG.getBitcast(SplitVT, HiV));
11967   };
11968
11969   SDValue LoV1, HiV1, LoV2, HiV2;
11970   std::tie(LoV1, HiV1) = SplitVector(V1);
11971   std::tie(LoV2, HiV2) = SplitVector(V2);
11972
11973   // Now create two 4-way blends of these half-width vectors.
11974   auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11975     bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11976     SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11977     SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11978     SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11979     for (int i = 0; i < SplitNumElements; ++i) {
11980       int M = HalfMask[i];
11981       if (M >= NumElements) {
11982         if (M >= NumElements + SplitNumElements)
11983           UseHiV2 = true;
11984         else
11985           UseLoV2 = true;
11986         V2BlendMask[i] = M - NumElements;
11987         BlendMask[i] = SplitNumElements + i;
11988       } else if (M >= 0) {
11989         if (M >= SplitNumElements)
11990           UseHiV1 = true;
11991         else
11992           UseLoV1 = true;
11993         V1BlendMask[i] = M;
11994         BlendMask[i] = i;
11995       }
11996     }
11997
11998     // Because the lowering happens after all combining takes place, we need to
11999     // manually combine these blend masks as much as possible so that we create
12000     // a minimal number of high-level vector shuffle nodes.
12001
12002     // First try just blending the halves of V1 or V2.
12003     if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12004       return DAG.getUNDEF(SplitVT);
12005     if (!UseLoV2 && !UseHiV2)
12006       return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12007     if (!UseLoV1 && !UseHiV1)
12008       return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12009
12010     SDValue V1Blend, V2Blend;
12011     if (UseLoV1 && UseHiV1) {
12012       V1Blend =
12013         DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12014     } else {
12015       // We only use half of V1 so map the usage down into the final blend mask.
12016       V1Blend = UseLoV1 ? LoV1 : HiV1;
12017       for (int i = 0; i < SplitNumElements; ++i)
12018         if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12019           BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12020     }
12021     if (UseLoV2 && UseHiV2) {
12022       V2Blend =
12023         DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12024     } else {
12025       // We only use half of V2 so map the usage down into the final blend mask.
12026       V2Blend = UseLoV2 ? LoV2 : HiV2;
12027       for (int i = 0; i < SplitNumElements; ++i)
12028         if (BlendMask[i] >= SplitNumElements)
12029           BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12030     }
12031     return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12032   };
12033   SDValue Lo = HalfBlend(LoMask);
12034   SDValue Hi = HalfBlend(HiMask);
12035   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12036 }
12037
12038 /// \brief Either split a vector in halves or decompose the shuffles and the
12039 /// blend.
12040 ///
12041 /// This is provided as a good fallback for many lowerings of non-single-input
12042 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12043 /// between splitting the shuffle into 128-bit components and stitching those
12044 /// back together vs. extracting the single-input shuffles and blending those
12045 /// results.
12046 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12047                                                 SDValue V1, SDValue V2,
12048                                                 ArrayRef<int> Mask,
12049                                                 SelectionDAG &DAG) {
12050   assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12051          "shuffles as it could then recurse on itself.");
12052   int Size = Mask.size();
12053
12054   // If this can be modeled as a broadcast of two elements followed by a blend,
12055   // prefer that lowering. This is especially important because broadcasts can
12056   // often fold with memory operands.
12057   auto DoBothBroadcast = [&] {
12058     int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12059     for (int M : Mask)
12060       if (M >= Size) {
12061         if (V2BroadcastIdx < 0)
12062           V2BroadcastIdx = M - Size;
12063         else if (M - Size != V2BroadcastIdx)
12064           return false;
12065       } else if (M >= 0) {
12066         if (V1BroadcastIdx < 0)
12067           V1BroadcastIdx = M;
12068         else if (M != V1BroadcastIdx)
12069           return false;
12070       }
12071     return true;
12072   };
12073   if (DoBothBroadcast())
12074     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12075                                                       DAG);
12076
12077   // If the inputs all stem from a single 128-bit lane of each input, then we
12078   // split them rather than blending because the split will decompose to
12079   // unusually few instructions.
12080   int LaneCount = VT.getSizeInBits() / 128;
12081   int LaneSize = Size / LaneCount;
12082   SmallBitVector LaneInputs[2];
12083   LaneInputs[0].resize(LaneCount, false);
12084   LaneInputs[1].resize(LaneCount, false);
12085   for (int i = 0; i < Size; ++i)
12086     if (Mask[i] >= 0)
12087       LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12088   if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12089     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12090
12091   // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12092   // that the decomposed single-input shuffles don't end up here.
12093   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12094 }
12095
12096 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12097 /// a permutation and blend of those lanes.
12098 ///
12099 /// This essentially blends the out-of-lane inputs to each lane into the lane
12100 /// from a permuted copy of the vector. This lowering strategy results in four
12101 /// instructions in the worst case for a single-input cross lane shuffle which
12102 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12103 /// of. Special cases for each particular shuffle pattern should be handled
12104 /// prior to trying this lowering.
12105 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12106                                                        SDValue V1, SDValue V2,
12107                                                        ArrayRef<int> Mask,
12108                                                        SelectionDAG &DAG) {
12109   // FIXME: This should probably be generalized for 512-bit vectors as well.
12110   assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12111   int Size = Mask.size();
12112   int LaneSize = Size / 2;
12113
12114   // If there are only inputs from one 128-bit lane, splitting will in fact be
12115   // less expensive. The flags track whether the given lane contains an element
12116   // that crosses to another lane.
12117   bool LaneCrossing[2] = {false, false};
12118   for (int i = 0; i < Size; ++i)
12119     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12120       LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12121   if (!LaneCrossing[0] || !LaneCrossing[1])
12122     return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12123
12124   assert(V2.isUndef() &&
12125          "This last part of this routine only works on single input shuffles");
12126
12127   SmallVector<int, 32> FlippedBlendMask(Size);
12128   for (int i = 0; i < Size; ++i)
12129     FlippedBlendMask[i] =
12130         Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12131                                 ? Mask[i]
12132                                 : Mask[i] % LaneSize +
12133                                       (i / LaneSize) * LaneSize + Size);
12134
12135   // Flip the vector, and blend the results which should now be in-lane. The
12136   // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
12137   // 5 for the high source. The value 3 selects the high half of source 2 and
12138   // the value 2 selects the low half of source 2. We only use source 2 to
12139   // allow folding it into a memory operand.
12140   unsigned PERMMask = 3 | 2 << 4;
12141   SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
12142                                 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
12143   return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12144 }
12145
12146 /// \brief Handle lowering 2-lane 128-bit shuffles.
12147 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12148                                         SDValue V2, ArrayRef<int> Mask,
12149                                         const APInt &Zeroable,
12150                                         const X86Subtarget &Subtarget,
12151                                         SelectionDAG &DAG) {
12152   SmallVector<int, 4> WidenedMask;
12153   if (!canWidenShuffleElements(Mask, WidenedMask))
12154     return SDValue();
12155
12156   // TODO: If minimizing size and one of the inputs is a zero vector and the
12157   // the zero vector has only one use, we could use a VPERM2X128 to save the
12158   // instruction bytes needed to explicitly generate the zero vector.
12159
12160   // Blends are faster and handle all the non-lane-crossing cases.
12161   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12162                                                 Zeroable, Subtarget, DAG))
12163     return Blend;
12164
12165   bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
12166   bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
12167
12168   // If either input operand is a zero vector, use VPERM2X128 because its mask
12169   // allows us to replace the zero input with an implicit zero.
12170   if (!IsV1Zero && !IsV2Zero) {
12171     // Check for patterns which can be matched with a single insert of a 128-bit
12172     // subvector.
12173     bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12174     if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12175       // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
12176       if (Subtarget.hasAVX2() && V2.isUndef())
12177         return SDValue();
12178
12179       // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12180       // this will likely become vinsertf128 which can't fold a 256-bit memop.
12181       if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12182         MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12183                                      VT.getVectorNumElements() / 2);
12184         SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12185                                   DAG.getIntPtrConstant(0, DL));
12186         SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12187                                   OnlyUsesV1 ? V1 : V2,
12188                                   DAG.getIntPtrConstant(0, DL));
12189         return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12190       }
12191     }
12192   }
12193
12194   // Otherwise form a 128-bit permutation. After accounting for undefs,
12195   // convert the 64-bit shuffle mask selection values into 128-bit
12196   // selection bits by dividing the indexes by 2 and shifting into positions
12197   // defined by a vperm2*128 instruction's immediate control byte.
12198
12199   // The immediate permute control byte looks like this:
12200   //    [1:0] - select 128 bits from sources for low half of destination
12201   //    [2]   - ignore
12202   //    [3]   - zero low half of destination
12203   //    [5:4] - select 128 bits from sources for high half of destination
12204   //    [6]   - ignore
12205   //    [7]   - zero high half of destination
12206
12207   int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
12208   int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
12209
12210   unsigned PermMask = MaskLO | (MaskHI << 4);
12211
12212   // If either input is a zero vector, replace it with an undef input.
12213   // Shuffle mask values <  4 are selecting elements of V1.
12214   // Shuffle mask values >= 4 are selecting elements of V2.
12215   // Adjust each half of the permute mask by clearing the half that was
12216   // selecting the zero vector and setting the zero mask bit.
12217   if (IsV1Zero) {
12218     V1 = DAG.getUNDEF(VT);
12219     if (MaskLO < 2)
12220       PermMask = (PermMask & 0xf0) | 0x08;
12221     if (MaskHI < 2)
12222       PermMask = (PermMask & 0x0f) | 0x80;
12223   }
12224   if (IsV2Zero) {
12225     V2 = DAG.getUNDEF(VT);
12226     if (MaskLO >= 2)
12227       PermMask = (PermMask & 0xf0) | 0x08;
12228     if (MaskHI >= 2)
12229       PermMask = (PermMask & 0x0f) | 0x80;
12230   }
12231
12232   return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12233                      DAG.getConstant(PermMask, DL, MVT::i8));
12234 }
12235
12236 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12237 /// shuffling each lane.
12238 ///
12239 /// This will only succeed when the result of fixing the 128-bit lanes results
12240 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12241 /// each 128-bit lanes. This handles many cases where we can quickly blend away
12242 /// the lane crosses early and then use simpler shuffles within each lane.
12243 ///
12244 /// FIXME: It might be worthwhile at some point to support this without
12245 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12246 /// in x86 only floating point has interesting non-repeating shuffles, and even
12247 /// those are still *marginally* more expensive.
12248 static SDValue lowerVectorShuffleByMerging128BitLanes(
12249     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12250     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12251   assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12252
12253   int Size = Mask.size();
12254   int LaneSize = 128 / VT.getScalarSizeInBits();
12255   int NumLanes = Size / LaneSize;
12256   assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12257
12258   // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12259   // check whether the in-128-bit lane shuffles share a repeating pattern.
12260   SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12261   SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12262   for (int i = 0; i < Size; ++i) {
12263     if (Mask[i] < 0)
12264       continue;
12265
12266     int j = i / LaneSize;
12267
12268     if (Lanes[j] < 0) {
12269       // First entry we've seen for this lane.
12270       Lanes[j] = Mask[i] / LaneSize;
12271     } else if (Lanes[j] != Mask[i] / LaneSize) {
12272       // This doesn't match the lane selected previously!
12273       return SDValue();
12274     }
12275
12276     // Check that within each lane we have a consistent shuffle mask.
12277     int k = i % LaneSize;
12278     if (InLaneMask[k] < 0) {
12279       InLaneMask[k] = Mask[i] % LaneSize;
12280     } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12281       // This doesn't fit a repeating in-lane mask.
12282       return SDValue();
12283     }
12284   }
12285
12286   // First shuffle the lanes into place.
12287   MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12288                                 VT.getSizeInBits() / 64);
12289   SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12290   for (int i = 0; i < NumLanes; ++i)
12291     if (Lanes[i] >= 0) {
12292       LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12293       LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12294     }
12295
12296   V1 = DAG.getBitcast(LaneVT, V1);
12297   V2 = DAG.getBitcast(LaneVT, V2);
12298   SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12299
12300   // Cast it back to the type we actually want.
12301   LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12302
12303   // Now do a simple shuffle that isn't lane crossing.
12304   SmallVector<int, 8> NewMask((unsigned)Size, -1);
12305   for (int i = 0; i < Size; ++i)
12306     if (Mask[i] >= 0)
12307       NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12308   assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12309          "Must not introduce lane crosses at this point!");
12310
12311   return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12312 }
12313
12314 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12315 /// This allows for fast cases such as subvector extraction/insertion
12316 /// or shuffling smaller vector types which can lower more efficiently.
12317 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12318                                                SDValue V1, SDValue V2,
12319                                                ArrayRef<int> Mask,
12320                                                const X86Subtarget &Subtarget,
12321                                                SelectionDAG &DAG) {
12322   assert(VT.is256BitVector() && "Expected 256-bit vector");
12323
12324   unsigned NumElts = VT.getVectorNumElements();
12325   unsigned HalfNumElts = NumElts / 2;
12326   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12327
12328   bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12329   bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12330   if (!UndefLower && !UndefUpper)
12331     return SDValue();
12332
12333   // Upper half is undef and lower half is whole upper subvector.
12334   // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12335   if (UndefUpper &&
12336       isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12337     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12338                              DAG.getIntPtrConstant(HalfNumElts, DL));
12339     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12340                        DAG.getIntPtrConstant(0, DL));
12341   }
12342
12343   // Lower half is undef and upper half is whole lower subvector.
12344   // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12345   if (UndefLower &&
12346       isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12347     SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12348                              DAG.getIntPtrConstant(0, DL));
12349     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12350                        DAG.getIntPtrConstant(HalfNumElts, DL));
12351   }
12352
12353   // If the shuffle only uses two of the four halves of the input operands,
12354   // then extract them and perform the 'half' shuffle at half width.
12355   // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12356   int HalfIdx1 = -1, HalfIdx2 = -1;
12357   SmallVector<int, 8> HalfMask(HalfNumElts);
12358   unsigned Offset = UndefLower ? HalfNumElts : 0;
12359   for (unsigned i = 0; i != HalfNumElts; ++i) {
12360     int M = Mask[i + Offset];
12361     if (M < 0) {
12362       HalfMask[i] = M;
12363       continue;
12364     }
12365
12366     // Determine which of the 4 half vectors this element is from.
12367     // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12368     int HalfIdx = M / HalfNumElts;
12369
12370     // Determine the element index into its half vector source.
12371     int HalfElt = M % HalfNumElts;
12372
12373     // We can shuffle with up to 2 half vectors, set the new 'half'
12374     // shuffle mask accordingly.
12375     if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12376       HalfMask[i] = HalfElt;
12377       HalfIdx1 = HalfIdx;
12378       continue;
12379     }
12380     if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12381       HalfMask[i] = HalfElt + HalfNumElts;
12382       HalfIdx2 = HalfIdx;
12383       continue;
12384     }
12385
12386     // Too many half vectors referenced.
12387     return SDValue();
12388   }
12389   assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12390
12391   // Only shuffle the halves of the inputs when useful.
12392   int NumLowerHalves =
12393       (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12394   int NumUpperHalves =
12395       (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12396
12397   // uuuuXXXX - don't extract uppers just to insert again.
12398   if (UndefLower && NumUpperHalves != 0)
12399     return SDValue();
12400
12401   // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12402   if (UndefUpper && NumUpperHalves == 2)
12403     return SDValue();
12404
12405   // AVX2 - XXXXuuuu - always extract lowers.
12406   if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12407     // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12408     if (VT == MVT::v4f64 || VT == MVT::v4i64)
12409       return SDValue();
12410     // AVX2 supports variable 32-bit element cross-lane shuffles.
12411     if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12412       // XXXXuuuu - don't extract lowers and uppers.
12413       if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12414         return SDValue();
12415     }
12416   }
12417
12418   auto GetHalfVector = [&](int HalfIdx) {
12419     if (HalfIdx < 0)
12420       return DAG.getUNDEF(HalfVT);
12421     SDValue V = (HalfIdx < 2 ? V1 : V2);
12422     HalfIdx = (HalfIdx % 2) * HalfNumElts;
12423     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12424                        DAG.getIntPtrConstant(HalfIdx, DL));
12425   };
12426
12427   SDValue Half1 = GetHalfVector(HalfIdx1);
12428   SDValue Half2 = GetHalfVector(HalfIdx2);
12429   SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12430   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12431                      DAG.getIntPtrConstant(Offset, DL));
12432 }
12433
12434 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12435 /// given mask.
12436 ///
12437 /// This returns true if the elements from a particular input are already in the
12438 /// slot required by the given mask and require no permutation.
12439 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12440   assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12441   int Size = Mask.size();
12442   for (int i = 0; i < Size; ++i)
12443     if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12444       return false;
12445
12446   return true;
12447 }
12448
12449 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12450 /// every lane can be represented as the same repeating mask - allowing us to
12451 /// shuffle the sources with the repeating shuffle and then permute the result
12452 /// to the destination lanes.
12453 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12454     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12455     const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12456   int NumElts = VT.getVectorNumElements();
12457   int NumLanes = VT.getSizeInBits() / 128;
12458   int NumLaneElts = NumElts / NumLanes;
12459
12460   // On AVX2 we may be able to just shuffle the lowest elements and then
12461   // broadcast the result.
12462   if (Subtarget.hasAVX2()) {
12463     for (unsigned BroadcastSize : {16, 32, 64}) {
12464       if (BroadcastSize <= VT.getScalarSizeInBits())
12465         continue;
12466       int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12467
12468       // Attempt to match a repeating pattern every NumBroadcastElts,
12469       // accounting for UNDEFs but only references the lowest 128-bit
12470       // lane of the inputs.
12471       auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12472         for (int i = 0; i != NumElts; i += NumBroadcastElts)
12473           for (int j = 0; j != NumBroadcastElts; ++j) {
12474             int M = Mask[i + j];
12475             if (M < 0)
12476               continue;
12477             int &R = RepeatMask[j];
12478             if (0 != ((M % NumElts) / NumLaneElts))
12479               return false;
12480             if (0 <= R && R != M)
12481               return false;
12482             R = M;
12483           }
12484         return true;
12485       };
12486
12487       SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12488       if (!FindRepeatingBroadcastMask(RepeatMask))
12489         continue;
12490
12491       // Shuffle the (lowest) repeated elements in place for broadcast.
12492       SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12493
12494       // Shuffle the actual broadcast.
12495       SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12496       for (int i = 0; i != NumElts; i += NumBroadcastElts)
12497         for (int j = 0; j != NumBroadcastElts; ++j)
12498           BroadcastMask[i + j] = j;
12499       return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12500                                   BroadcastMask);
12501     }
12502   }
12503
12504   // Bail if the shuffle mask doesn't cross 128-bit lanes.
12505   if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12506     return SDValue();
12507
12508   // Bail if we already have a repeated lane shuffle mask.
12509   SmallVector<int, 8> RepeatedShuffleMask;
12510   if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12511     return SDValue();
12512
12513   // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12514   // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12515   int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12516   int NumSubLanes = NumLanes * SubLaneScale;
12517   int NumSubLaneElts = NumLaneElts / SubLaneScale;
12518
12519   // Check that all the sources are coming from the same lane and see if we can
12520   // form a repeating shuffle mask (local to each sub-lane). At the same time,
12521   // determine the source sub-lane for each destination sub-lane.
12522   int TopSrcSubLane = -1;
12523   SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12524   SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12525       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12526       SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12527
12528   for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12529     // Extract the sub-lane mask, check that it all comes from the same lane
12530     // and normalize the mask entries to come from the first lane.
12531     int SrcLane = -1;
12532     SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12533     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12534       int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12535       if (M < 0)
12536         continue;
12537       int Lane = (M % NumElts) / NumLaneElts;
12538       if ((0 <= SrcLane) && (SrcLane != Lane))
12539         return SDValue();
12540       SrcLane = Lane;
12541       int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12542       SubLaneMask[Elt] = LocalM;
12543     }
12544
12545     // Whole sub-lane is UNDEF.
12546     if (SrcLane < 0)
12547       continue;
12548
12549     // Attempt to match against the candidate repeated sub-lane masks.
12550     for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12551       auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12552         for (int i = 0; i != NumSubLaneElts; ++i) {
12553           if (M1[i] < 0 || M2[i] < 0)
12554             continue;
12555           if (M1[i] != M2[i])
12556             return false;
12557         }
12558         return true;
12559       };
12560
12561       auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12562       if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12563         continue;
12564
12565       // Merge the sub-lane mask into the matching repeated sub-lane mask.
12566       for (int i = 0; i != NumSubLaneElts; ++i) {
12567         int M = SubLaneMask[i];
12568         if (M < 0)
12569           continue;
12570         assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12571                "Unexpected mask element");
12572         RepeatedSubLaneMask[i] = M;
12573       }
12574
12575       // Track the top most source sub-lane - by setting the remaining to UNDEF
12576       // we can greatly simplify shuffle matching.
12577       int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12578       TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12579       Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12580       break;
12581     }
12582
12583     // Bail if we failed to find a matching repeated sub-lane mask.
12584     if (Dst2SrcSubLanes[DstSubLane] < 0)
12585       return SDValue();
12586   }
12587   assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12588          "Unexpected source lane");
12589
12590   // Create a repeating shuffle mask for the entire vector.
12591   SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12592   for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12593     int Lane = SubLane / SubLaneScale;
12594     auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12595     for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12596       int M = RepeatedSubLaneMask[Elt];
12597       if (M < 0)
12598         continue;
12599       int Idx = (SubLane * NumSubLaneElts) + Elt;
12600       RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12601     }
12602   }
12603   SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12604
12605   // Shuffle each source sub-lane to its destination.
12606   SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12607   for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12608     int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12609     if (SrcSubLane < 0)
12610       continue;
12611     for (int j = 0; j != NumSubLaneElts; ++j)
12612       SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12613   }
12614
12615   return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12616                               SubLaneMask);
12617 }
12618
12619 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12620                                          unsigned &ShuffleImm,
12621                                          ArrayRef<int> Mask) {
12622   int NumElts = VT.getVectorNumElements();
12623   assert(VT.getScalarSizeInBits() == 64 &&
12624          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12625          "Unexpected data type for VSHUFPD");
12626
12627   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
12628   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
12629   ShuffleImm = 0;
12630   bool ShufpdMask = true;
12631   bool CommutableMask = true;
12632   for (int i = 0; i < NumElts; ++i) {
12633     if (Mask[i] == SM_SentinelUndef)
12634       continue;
12635     if (Mask[i] < 0)
12636       return false;
12637     int Val = (i & 6) + NumElts * (i & 1);
12638     int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12639     if (Mask[i] < Val || Mask[i] > Val + 1)
12640       ShufpdMask = false;
12641     if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12642       CommutableMask = false;
12643     ShuffleImm |= (Mask[i] % 2) << i;
12644   }
12645
12646   if (ShufpdMask)
12647     return true;
12648   if (CommutableMask) {
12649     std::swap(V1, V2);
12650     return true;
12651   }
12652
12653   return false;
12654 }
12655
12656 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12657                                             ArrayRef<int> Mask, SDValue V1,
12658                                             SDValue V2, SelectionDAG &DAG) {
12659   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
12660          "Unexpected data type for VSHUFPD");
12661
12662   unsigned Immediate = 0;
12663   if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12664     return SDValue();
12665
12666   return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12667                      DAG.getConstant(Immediate, DL, MVT::i8));
12668 }
12669
12670 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12671                                            ArrayRef<int> Mask, SDValue V1,
12672                                            SDValue V2, SelectionDAG &DAG) {
12673   MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12674   MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12675
12676   SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12677   if (V2.isUndef())
12678     return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12679
12680   return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12681 }
12682
12683 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12684 ///
12685 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12686 /// isn't available.
12687 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12688                                        const APInt &Zeroable,
12689                                        SDValue V1, SDValue V2,
12690                                        const X86Subtarget &Subtarget,
12691                                        SelectionDAG &DAG) {
12692   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12693   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12694   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12695
12696   if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12697                                            Zeroable, Subtarget, DAG))
12698     return V;
12699
12700   if (V2.isUndef()) {
12701     // Check for being able to broadcast a single element.
12702     if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12703             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12704       return Broadcast;
12705
12706     // Use low duplicate instructions for masks that match their pattern.
12707     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12708       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12709
12710     if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12711       // Non-half-crossing single input shuffles can be lowered with an
12712       // interleaved permutation.
12713       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12714                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12715       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12716                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12717     }
12718
12719     // With AVX2 we have direct support for this permutation.
12720     if (Subtarget.hasAVX2())
12721       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12722                          getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12723
12724     // Try to create an in-lane repeating shuffle mask and then shuffle the
12725     // the results into the target lanes.
12726     if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12727             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12728       return V;
12729
12730     // Otherwise, fall back.
12731     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12732                                                    DAG);
12733   }
12734
12735   // Use dedicated unpack instructions for masks that match their pattern.
12736   if (SDValue V =
12737           lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12738     return V;
12739
12740   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12741                                                 Zeroable, Subtarget, DAG))
12742     return Blend;
12743
12744   // Check if the blend happens to exactly fit that of SHUFPD.
12745   if (SDValue Op =
12746       lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12747     return Op;
12748
12749   // Try to create an in-lane repeating shuffle mask and then shuffle the
12750   // the results into the target lanes.
12751   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12752           DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12753     return V;
12754
12755   // Try to simplify this by merging 128-bit lanes to enable a lane-based
12756   // shuffle. However, if we have AVX2 and either inputs are already in place,
12757   // we will be able to shuffle even across lanes the other input in a single
12758   // instruction so skip this pattern.
12759   if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12760                                 isShuffleMaskInputInPlace(1, Mask))))
12761     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12762             DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12763       return Result;
12764   // If we have VLX support, we can use VEXPAND.
12765   if (Subtarget.hasVLX())
12766     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12767                                                V1, V2, DAG, Subtarget))
12768       return V;
12769
12770   // If we have AVX2 then we always want to lower with a blend because an v4 we
12771   // can fully permute the elements.
12772   if (Subtarget.hasAVX2())
12773     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12774                                                       Mask, DAG);
12775
12776   // Otherwise fall back on generic lowering.
12777   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12778 }
12779
12780 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12781 ///
12782 /// This routine is only called when we have AVX2 and thus a reasonable
12783 /// instruction set for v4i64 shuffling..
12784 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12785                                        const APInt &Zeroable,
12786                                        SDValue V1, SDValue V2,
12787                                        const X86Subtarget &Subtarget,
12788                                        SelectionDAG &DAG) {
12789   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12790   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12791   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12792   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12793
12794   if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12795                                            Zeroable, Subtarget, DAG))
12796     return V;
12797
12798   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12799                                                 Zeroable, Subtarget, DAG))
12800     return Blend;
12801
12802   // Check for being able to broadcast a single element.
12803   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12804                                                         Mask, Subtarget, DAG))
12805     return Broadcast;
12806
12807   if (V2.isUndef()) {
12808     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12809     // can use lower latency instructions that will operate on both lanes.
12810     SmallVector<int, 2> RepeatedMask;
12811     if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12812       SmallVector<int, 4> PSHUFDMask;
12813       scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12814       return DAG.getBitcast(
12815           MVT::v4i64,
12816           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12817                       DAG.getBitcast(MVT::v8i32, V1),
12818                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12819     }
12820
12821     // AVX2 provides a direct instruction for permuting a single input across
12822     // lanes.
12823     return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12824                        getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12825   }
12826
12827   // Try to use shift instructions.
12828   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12829                                                 Zeroable, Subtarget, DAG))
12830     return Shift;
12831
12832   // If we have VLX support, we can use VALIGN or VEXPAND.
12833   if (Subtarget.hasVLX()) {
12834     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12835                                                     Mask, Subtarget, DAG))
12836       return Rotate;
12837
12838     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12839                                                V1, V2, DAG, Subtarget))
12840       return V;
12841   }
12842
12843   // Try to use PALIGNR.
12844   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12845                                                       Mask, Subtarget, DAG))
12846     return Rotate;
12847
12848   // Use dedicated unpack instructions for masks that match their pattern.
12849   if (SDValue V =
12850           lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12851     return V;
12852
12853   // Try to simplify this by merging 128-bit lanes to enable a lane-based
12854   // shuffle. However, if we have AVX2 and either inputs are already in place,
12855   // we will be able to shuffle even across lanes the other input in a single
12856   // instruction so skip this pattern.
12857   if (!isShuffleMaskInputInPlace(0, Mask) &&
12858       !isShuffleMaskInputInPlace(1, Mask))
12859     if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12860             DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12861       return Result;
12862
12863   // Otherwise fall back on generic blend lowering.
12864   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12865                                                     Mask, DAG);
12866 }
12867
12868 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12869 ///
12870 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12871 /// isn't available.
12872 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12873                                        const APInt &Zeroable,
12874                                        SDValue V1, SDValue V2,
12875                                        const X86Subtarget &Subtarget,
12876                                        SelectionDAG &DAG) {
12877   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12878   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12879   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12880
12881   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12882                                                 Zeroable, Subtarget, DAG))
12883     return Blend;
12884
12885   // Check for being able to broadcast a single element.
12886   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12887                                                         Mask, Subtarget, DAG))
12888     return Broadcast;
12889
12890   // If the shuffle mask is repeated in each 128-bit lane, we have many more
12891   // options to efficiently lower the shuffle.
12892   SmallVector<int, 4> RepeatedMask;
12893   if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12894     assert(RepeatedMask.size() == 4 &&
12895            "Repeated masks must be half the mask width!");
12896
12897     // Use even/odd duplicate instructions for masks that match their pattern.
12898     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12899       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12900     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12901       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12902
12903     if (V2.isUndef())
12904       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12905                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12906
12907     // Use dedicated unpack instructions for masks that match their pattern.
12908     if (SDValue V =
12909             lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12910       return V;
12911
12912     // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12913     // have already handled any direct blends.
12914     return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12915   }
12916
12917   // Try to create an in-lane repeating shuffle mask and then shuffle the
12918   // the results into the target lanes.
12919   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12920           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12921     return V;
12922
12923   // If we have a single input shuffle with different shuffle patterns in the
12924   // two 128-bit lanes use the variable mask to VPERMILPS.
12925   if (V2.isUndef()) {
12926     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12927     if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12928       return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12929
12930     if (Subtarget.hasAVX2())
12931       return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12932
12933     // Otherwise, fall back.
12934     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12935                                                    DAG);
12936   }
12937
12938   // Try to simplify this by merging 128-bit lanes to enable a lane-based
12939   // shuffle.
12940   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12941           DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12942     return Result;
12943   // If we have VLX support, we can use VEXPAND.
12944   if (Subtarget.hasVLX())
12945     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12946                                                V1, V2, DAG, Subtarget))
12947       return V;
12948
12949   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12950   // since after split we get a more efficient code using vpunpcklwd and
12951   // vpunpckhwd instrs than vblend.
12952   if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12953     if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12954                                                      Mask, DAG))
12955       return V;
12956
12957   // If we have AVX2 then we always want to lower with a blend because at v8 we
12958   // can fully permute the elements.
12959   if (Subtarget.hasAVX2())
12960     return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12961                                                       Mask, DAG);
12962
12963   // Otherwise fall back on generic lowering.
12964   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12965 }
12966
12967 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12968 ///
12969 /// This routine is only called when we have AVX2 and thus a reasonable
12970 /// instruction set for v8i32 shuffling..
12971 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12972                                        const APInt &Zeroable,
12973                                        SDValue V1, SDValue V2,
12974                                        const X86Subtarget &Subtarget,
12975                                        SelectionDAG &DAG) {
12976   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12977   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12978   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12979   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12980
12981   // Whenever we can lower this as a zext, that instruction is strictly faster
12982   // than any alternative. It also allows us to fold memory operands into the
12983   // shuffle in many cases.
12984   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12985           DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12986     return ZExt;
12987
12988   // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
12989   // since after split we get a more efficient code than vblend by using
12990   // vpunpcklwd and vpunpckhwd instrs.
12991   if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12992       !Subtarget.hasAVX512())
12993     if (SDValue V =
12994             lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12995       return V;
12996
12997   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12998                                                 Zeroable, Subtarget, DAG))
12999     return Blend;
13000
13001   // Check for being able to broadcast a single element.
13002   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13003                                                         Mask, Subtarget, DAG))
13004     return Broadcast;
13005
13006   // If the shuffle mask is repeated in each 128-bit lane we can use more
13007   // efficient instructions that mirror the shuffles across the two 128-bit
13008   // lanes.
13009   SmallVector<int, 4> RepeatedMask;
13010   bool Is128BitLaneRepeatedShuffle =
13011       is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13012   if (Is128BitLaneRepeatedShuffle) {
13013     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13014     if (V2.isUndef())
13015       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13016                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13017
13018     // Use dedicated unpack instructions for masks that match their pattern.
13019     if (SDValue V =
13020             lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13021       return V;
13022   }
13023
13024   // Try to use shift instructions.
13025   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13026                                                 Zeroable, Subtarget, DAG))
13027     return Shift;
13028
13029   // If we have VLX support, we can use VALIGN or EXPAND.
13030   if (Subtarget.hasVLX()) {
13031     if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13032                                                     Mask, Subtarget, DAG))
13033       return Rotate;
13034
13035     if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13036                                                V1, V2, DAG, Subtarget))
13037       return V;
13038   }
13039
13040   // Try to use byte rotation instructions.
13041   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13042           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13043     return Rotate;
13044
13045   // Try to create an in-lane repeating shuffle mask and then shuffle the
13046   // results into the target lanes.
13047   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13048           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13049     return V;
13050
13051   // If the shuffle patterns aren't repeated but it is a single input, directly
13052   // generate a cross-lane VPERMD instruction.
13053   if (V2.isUndef()) {
13054     SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13055     return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13056   }
13057
13058   // Assume that a single SHUFPS is faster than an alternative sequence of
13059   // multiple instructions (even if the CPU has a domain penalty).
13060   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13061   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13062     SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13063     SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13064     SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13065                                                   CastV1, CastV2, DAG);
13066     return DAG.getBitcast(MVT::v8i32, ShufPS);
13067   }
13068
13069   // Try to simplify this by merging 128-bit lanes to enable a lane-based
13070   // shuffle.
13071   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13072           DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13073     return Result;
13074
13075   // Otherwise fall back on generic blend lowering.
13076   return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13077                                                     Mask, DAG);
13078 }
13079
13080 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13081 ///
13082 /// This routine is only called when we have AVX2 and thus a reasonable
13083 /// instruction set for v16i16 shuffling..
13084 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13085                                         const APInt &Zeroable,
13086                                         SDValue V1, SDValue V2,
13087                                         const X86Subtarget &Subtarget,
13088                                         SelectionDAG &DAG) {
13089   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13090   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13091   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13092   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13093
13094   // Whenever we can lower this as a zext, that instruction is strictly faster
13095   // than any alternative. It also allows us to fold memory operands into the
13096   // shuffle in many cases.
13097   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13098           DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13099     return ZExt;
13100
13101   // Check for being able to broadcast a single element.
13102   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13103                                                         Mask, Subtarget, DAG))
13104     return Broadcast;
13105
13106   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13107                                                 Zeroable, Subtarget, DAG))
13108     return Blend;
13109
13110   // Use dedicated unpack instructions for masks that match their pattern.
13111   if (SDValue V =
13112           lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13113     return V;
13114
13115   // Try to use shift instructions.
13116   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13117                                                 Zeroable, Subtarget, DAG))
13118     return Shift;
13119
13120   // Try to use byte rotation instructions.
13121   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13122           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13123     return Rotate;
13124
13125   // Try to create an in-lane repeating shuffle mask and then shuffle the
13126   // the results into the target lanes.
13127   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13128           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13129     return V;
13130
13131   if (V2.isUndef()) {
13132     // There are no generalized cross-lane shuffle operations available on i16
13133     // element types.
13134     if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13135       return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13136                                                      Mask, DAG);
13137
13138     SmallVector<int, 8> RepeatedMask;
13139     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13140       // As this is a single-input shuffle, the repeated mask should be
13141       // a strictly valid v8i16 mask that we can pass through to the v8i16
13142       // lowering to handle even the v16 case.
13143       return lowerV8I16GeneralSingleInputVectorShuffle(
13144           DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13145     }
13146   }
13147
13148   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13149           DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13150     return PSHUFB;
13151
13152   // AVX512BWVL can lower to VPERMW.
13153   if (Subtarget.hasBWI() && Subtarget.hasVLX())
13154     return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13155
13156   // Try to simplify this by merging 128-bit lanes to enable a lane-based
13157   // shuffle.
13158   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13159           DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13160     return Result;
13161
13162   // Otherwise fall back on generic lowering.
13163   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13164 }
13165
13166 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13167 ///
13168 /// This routine is only called when we have AVX2 and thus a reasonable
13169 /// instruction set for v32i8 shuffling..
13170 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13171                                        const APInt &Zeroable,
13172                                        SDValue V1, SDValue V2,
13173                                        const X86Subtarget &Subtarget,
13174                                        SelectionDAG &DAG) {
13175   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13176   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13177   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13178   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13179
13180   // Whenever we can lower this as a zext, that instruction is strictly faster
13181   // than any alternative. It also allows us to fold memory operands into the
13182   // shuffle in many cases.
13183   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13184           DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13185     return ZExt;
13186
13187   // Check for being able to broadcast a single element.
13188   if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13189                                                         Mask, Subtarget, DAG))
13190     return Broadcast;
13191
13192   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13193                                                 Zeroable, Subtarget, DAG))
13194     return Blend;
13195
13196   // Use dedicated unpack instructions for masks that match their pattern.
13197   if (SDValue V =
13198           lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13199     return V;
13200
13201   // Try to use shift instructions.
13202   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13203                                                 Zeroable, Subtarget, DAG))
13204     return Shift;
13205
13206   // Try to use byte rotation instructions.
13207   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13208           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13209     return Rotate;
13210
13211   // Try to create an in-lane repeating shuffle mask and then shuffle the
13212   // the results into the target lanes.
13213   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13214           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13215     return V;
13216
13217   // There are no generalized cross-lane shuffle operations available on i8
13218   // element types.
13219   if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13220     return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13221                                                    DAG);
13222
13223   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13224           DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13225     return PSHUFB;
13226
13227   // Try to simplify this by merging 128-bit lanes to enable a lane-based
13228   // shuffle.
13229   if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13230           DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13231     return Result;
13232
13233   // Otherwise fall back on generic lowering.
13234   return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13235 }
13236
13237 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13238 ///
13239 /// This routine either breaks down the specific type of a 256-bit x86 vector
13240 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13241 /// together based on the available instructions.
13242 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13243                                         MVT VT, SDValue V1, SDValue V2,
13244                                         const APInt &Zeroable,
13245                                         const X86Subtarget &Subtarget,
13246                                         SelectionDAG &DAG) {
13247   // If we have a single input to the zero element, insert that into V1 if we
13248   // can do so cheaply.
13249   int NumElts = VT.getVectorNumElements();
13250   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13251
13252   if (NumV2Elements == 1 && Mask[0] >= NumElts)
13253     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13254             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13255       return Insertion;
13256
13257   // Handle special cases where the lower or upper half is UNDEF.
13258   if (SDValue V =
13259           lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13260     return V;
13261
13262   // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13263   // can check for those subtargets here and avoid much of the subtarget
13264   // querying in the per-vector-type lowering routines. With AVX1 we have
13265   // essentially *zero* ability to manipulate a 256-bit vector with integer
13266   // types. Since we'll use floating point types there eventually, just
13267   // immediately cast everything to a float and operate entirely in that domain.
13268   if (VT.isInteger() && !Subtarget.hasAVX2()) {
13269     int ElementBits = VT.getScalarSizeInBits();
13270     if (ElementBits < 32) {
13271       // No floating point type available, if we can't use the bit operations
13272       // for masking/blending then decompose into 128-bit vectors.
13273       if (SDValue V =
13274               lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13275         return V;
13276       if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13277         return V;
13278       return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13279     }
13280
13281     MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13282                                 VT.getVectorNumElements());
13283     V1 = DAG.getBitcast(FpVT, V1);
13284     V2 = DAG.getBitcast(FpVT, V2);
13285     return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13286   }
13287
13288   switch (VT.SimpleTy) {
13289   case MVT::v4f64:
13290     return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13291   case MVT::v4i64:
13292     return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13293   case MVT::v8f32:
13294     return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13295   case MVT::v8i32:
13296     return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13297   case MVT::v16i16:
13298     return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13299   case MVT::v32i8:
13300     return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13301
13302   default:
13303     llvm_unreachable("Not a valid 256-bit x86 vector type!");
13304   }
13305 }
13306
13307 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13308 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13309                                         ArrayRef<int> Mask, SDValue V1,
13310                                         SDValue V2, SelectionDAG &DAG) {
13311   assert(VT.getScalarSizeInBits() == 64 &&
13312          "Unexpected element type size for 128bit shuffle.");
13313
13314   // To handle 256 bit vector requires VLX and most probably
13315   // function lowerV2X128VectorShuffle() is better solution.
13316   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13317
13318   SmallVector<int, 4> WidenedMask;
13319   if (!canWidenShuffleElements(Mask, WidenedMask))
13320     return SDValue();
13321
13322   // Check for patterns which can be matched with a single insert of a 256-bit
13323   // subvector.
13324   bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13325                                         {0, 1, 2, 3, 0, 1, 2, 3});
13326   if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13327                                         {0, 1, 2, 3, 8, 9, 10, 11})) {
13328     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13329     SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13330                               DAG.getIntPtrConstant(0, DL));
13331     SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13332                               OnlyUsesV1 ? V1 : V2,
13333                               DAG.getIntPtrConstant(0, DL));
13334     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13335   }
13336
13337   assert(WidenedMask.size() == 4);
13338
13339   // See if this is an insertion of the lower 128-bits of V2 into V1.
13340   bool IsInsert = true;
13341   int V2Index = -1;
13342   for (int i = 0; i < 4; ++i) {
13343     assert(WidenedMask[i] >= -1);
13344     if (WidenedMask[i] < 0)
13345       continue;
13346
13347     // Make sure all V1 subvectors are in place.
13348     if (WidenedMask[i] < 4) {
13349       if (WidenedMask[i] != i) {
13350         IsInsert = false;
13351         break;
13352       }
13353     } else {
13354       // Make sure we only have a single V2 index and its the lowest 128-bits.
13355       if (V2Index >= 0 || WidenedMask[i] != 4) {
13356         IsInsert = false;
13357         break;
13358       }
13359       V2Index = i;
13360     }
13361   }
13362   if (IsInsert && V2Index >= 0) {
13363     MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13364     SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13365                                  DAG.getIntPtrConstant(0, DL));
13366     return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13367   }
13368
13369   // Try to lower to to vshuf64x2/vshuf32x4.
13370   SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13371   unsigned PermMask = 0;
13372   // Insure elements came from the same Op.
13373   for (int i = 0; i < 4; ++i) {
13374     assert(WidenedMask[i] >= -1);
13375     if (WidenedMask[i] < 0)
13376       continue;
13377
13378     SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13379     unsigned OpIndex = i / 2;
13380     if (Ops[OpIndex].isUndef())
13381       Ops[OpIndex] = Op;
13382     else if (Ops[OpIndex] != Op)
13383       return SDValue();
13384
13385     // Convert the 128-bit shuffle mask selection values into 128-bit selection
13386     // bits defined by a vshuf64x2 instruction's immediate control byte.
13387     PermMask |= (WidenedMask[i] % 4) << (i * 2);
13388   }
13389
13390   return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13391                      DAG.getConstant(PermMask, DL, MVT::i8));
13392 }
13393
13394 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13395 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13396                                        const APInt &Zeroable,
13397                                        SDValue V1, SDValue V2,
13398                                        const X86Subtarget &Subtarget,
13399                                        SelectionDAG &DAG) {
13400   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13401   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13402   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13403
13404   if (V2.isUndef()) {
13405     // Use low duplicate instructions for masks that match their pattern.
13406     if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13407       return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13408
13409     if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13410       // Non-half-crossing single input shuffles can be lowered with an
13411       // interleaved permutation.
13412       unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13413                               ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13414                               ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13415                               ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13416       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13417                          DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13418     }
13419
13420     SmallVector<int, 4> RepeatedMask;
13421     if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13422       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13423                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13424   }
13425
13426   if (SDValue Shuf128 =
13427           lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13428     return Shuf128;
13429
13430   if (SDValue Unpck =
13431           lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13432     return Unpck;
13433
13434   // Check if the blend happens to exactly fit that of SHUFPD.
13435   if (SDValue Op =
13436       lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13437     return Op;
13438
13439   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13440                                              V2, DAG, Subtarget))
13441     return V;
13442
13443   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13444                                                 Zeroable, Subtarget, DAG))
13445     return Blend;
13446
13447   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13448 }
13449
13450 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13451 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13452                                         const APInt &Zeroable,
13453                                         SDValue V1, SDValue V2,
13454                                         const X86Subtarget &Subtarget,
13455                                         SelectionDAG &DAG) {
13456   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13457   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13458   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13459
13460   // If the shuffle mask is repeated in each 128-bit lane, we have many more
13461   // options to efficiently lower the shuffle.
13462   SmallVector<int, 4> RepeatedMask;
13463   if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13464     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13465
13466     // Use even/odd duplicate instructions for masks that match their pattern.
13467     if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13468       return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13469     if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13470       return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13471
13472     if (V2.isUndef())
13473       return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13474                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13475
13476     // Use dedicated unpack instructions for masks that match their pattern.
13477     if (SDValue Unpck =
13478             lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13479       return Unpck;
13480
13481     if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13482                                                   Zeroable, Subtarget, DAG))
13483       return Blend;
13484
13485     // Otherwise, fall back to a SHUFPS sequence.
13486     return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13487   }
13488   // If we have AVX512F support, we can use VEXPAND.
13489   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13490                                              V1, V2, DAG, Subtarget))
13491     return V;
13492
13493   return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13494 }
13495
13496 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13497 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13498                                        const APInt &Zeroable,
13499                                        SDValue V1, SDValue V2,
13500                                        const X86Subtarget &Subtarget,
13501                                        SelectionDAG &DAG) {
13502   assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13503   assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13504   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13505
13506   if (SDValue Shuf128 =
13507           lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13508     return Shuf128;
13509
13510   if (V2.isUndef()) {
13511     // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13512     // can use lower latency instructions that will operate on all four
13513     // 128-bit lanes.
13514     SmallVector<int, 2> Repeated128Mask;
13515     if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13516       SmallVector<int, 4> PSHUFDMask;
13517       scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13518       return DAG.getBitcast(
13519           MVT::v8i64,
13520           DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13521                       DAG.getBitcast(MVT::v16i32, V1),
13522                       getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13523     }
13524
13525     SmallVector<int, 4> Repeated256Mask;
13526     if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13527       return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13528                          getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13529   }
13530
13531   // Try to use shift instructions.
13532   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13533                                                 Zeroable, Subtarget, DAG))
13534     return Shift;
13535
13536   // Try to use VALIGN.
13537   if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13538                                                   Mask, Subtarget, DAG))
13539     return Rotate;
13540
13541   // Try to use PALIGNR.
13542   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13543                                                       Mask, Subtarget, DAG))
13544     return Rotate;
13545
13546   if (SDValue Unpck =
13547           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13548     return Unpck;
13549   // If we have AVX512F support, we can use VEXPAND.
13550   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13551                                              V2, DAG, Subtarget))
13552     return V;
13553
13554   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13555                                                 Zeroable, Subtarget, DAG))
13556     return Blend;
13557
13558   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13559 }
13560
13561 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13562 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13563                                         const APInt &Zeroable,
13564                                         SDValue V1, SDValue V2,
13565                                         const X86Subtarget &Subtarget,
13566                                         SelectionDAG &DAG) {
13567   assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13568   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13569   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13570
13571   // Whenever we can lower this as a zext, that instruction is strictly faster
13572   // than any alternative. It also allows us to fold memory operands into the
13573   // shuffle in many cases.
13574   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13575           DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13576     return ZExt;
13577
13578   // If the shuffle mask is repeated in each 128-bit lane we can use more
13579   // efficient instructions that mirror the shuffles across the four 128-bit
13580   // lanes.
13581   SmallVector<int, 4> RepeatedMask;
13582   bool Is128BitLaneRepeatedShuffle =
13583       is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13584   if (Is128BitLaneRepeatedShuffle) {
13585     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13586     if (V2.isUndef())
13587       return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13588                          getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13589
13590     // Use dedicated unpack instructions for masks that match their pattern.
13591     if (SDValue V =
13592             lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13593       return V;
13594   }
13595
13596   // Try to use shift instructions.
13597   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13598                                                 Zeroable, Subtarget, DAG))
13599     return Shift;
13600
13601   // Try to use VALIGN.
13602   if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13603                                                   Mask, Subtarget, DAG))
13604     return Rotate;
13605
13606   // Try to use byte rotation instructions.
13607   if (Subtarget.hasBWI())
13608     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13609             DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13610       return Rotate;
13611
13612   // Assume that a single SHUFPS is faster than using a permv shuffle.
13613   // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13614   if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13615     SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13616     SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13617     SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13618                                                   CastV1, CastV2, DAG);
13619     return DAG.getBitcast(MVT::v16i32, ShufPS);
13620   }
13621   // If we have AVX512F support, we can use VEXPAND.
13622   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13623                                              V1, V2, DAG, Subtarget))
13624     return V;
13625
13626   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13627                                                 Zeroable, Subtarget, DAG))
13628     return Blend;
13629   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13630 }
13631
13632 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13633 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13634                                         const APInt &Zeroable,
13635                                         SDValue V1, SDValue V2,
13636                                         const X86Subtarget &Subtarget,
13637                                         SelectionDAG &DAG) {
13638   assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13639   assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13640   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13641   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13642
13643   // Whenever we can lower this as a zext, that instruction is strictly faster
13644   // than any alternative. It also allows us to fold memory operands into the
13645   // shuffle in many cases.
13646   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13647           DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13648     return ZExt;
13649
13650   // Use dedicated unpack instructions for masks that match their pattern.
13651   if (SDValue V =
13652           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13653     return V;
13654
13655   // Try to use shift instructions.
13656   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13657                                                 Zeroable, Subtarget, DAG))
13658     return Shift;
13659
13660   // Try to use byte rotation instructions.
13661   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13662           DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13663     return Rotate;
13664
13665   if (V2.isUndef()) {
13666     SmallVector<int, 8> RepeatedMask;
13667     if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13668       // As this is a single-input shuffle, the repeated mask should be
13669       // a strictly valid v8i16 mask that we can pass through to the v8i16
13670       // lowering to handle even the v32 case.
13671       return lowerV8I16GeneralSingleInputVectorShuffle(
13672           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13673     }
13674   }
13675
13676   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13677                                                 Zeroable, Subtarget, DAG))
13678     return Blend;
13679
13680   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13681 }
13682
13683 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13684 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13685                                        const APInt &Zeroable,
13686                                        SDValue V1, SDValue V2,
13687                                        const X86Subtarget &Subtarget,
13688                                        SelectionDAG &DAG) {
13689   assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13690   assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13691   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13692   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13693
13694   // Whenever we can lower this as a zext, that instruction is strictly faster
13695   // than any alternative. It also allows us to fold memory operands into the
13696   // shuffle in many cases.
13697   if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13698           DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13699     return ZExt;
13700
13701   // Use dedicated unpack instructions for masks that match their pattern.
13702   if (SDValue V =
13703           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13704     return V;
13705
13706   // Try to use shift instructions.
13707   if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13708                                                 Zeroable, Subtarget, DAG))
13709     return Shift;
13710
13711   // Try to use byte rotation instructions.
13712   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13713           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13714     return Rotate;
13715
13716   if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13717           DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13718     return PSHUFB;
13719
13720   // VBMI can use VPERMV/VPERMV3 byte shuffles.
13721   if (Subtarget.hasVBMI())
13722     return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13723
13724   // Try to create an in-lane repeating shuffle mask and then shuffle the
13725   // the results into the target lanes.
13726   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13727           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13728     return V;
13729
13730   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13731                                                 Zeroable, Subtarget, DAG))
13732     return Blend;
13733
13734   // FIXME: Implement direct support for this type!
13735   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13736 }
13737
13738 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13739 ///
13740 /// This routine either breaks down the specific type of a 512-bit x86 vector
13741 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13742 /// together based on the available instructions.
13743 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13744                                         MVT VT, SDValue V1, SDValue V2,
13745                                         const APInt &Zeroable,
13746                                         const X86Subtarget &Subtarget,
13747                                         SelectionDAG &DAG) {
13748   assert(Subtarget.hasAVX512() &&
13749          "Cannot lower 512-bit vectors w/ basic ISA!");
13750
13751   // If we have a single input to the zero element, insert that into V1 if we
13752   // can do so cheaply.
13753   int NumElts = Mask.size();
13754   int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13755
13756   if (NumV2Elements == 1 && Mask[0] >= NumElts)
13757     if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13758             DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13759       return Insertion;
13760
13761   // Check for being able to broadcast a single element.
13762   if (SDValue Broadcast =
13763           lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13764     return Broadcast;
13765
13766   // Dispatch to each element type for lowering. If we don't have support for
13767   // specific element type shuffles at 512 bits, immediately split them and
13768   // lower them. Each lowering routine of a given type is allowed to assume that
13769   // the requisite ISA extensions for that element type are available.
13770   switch (VT.SimpleTy) {
13771   case MVT::v8f64:
13772     return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13773   case MVT::v16f32:
13774     return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13775   case MVT::v8i64:
13776     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13777   case MVT::v16i32:
13778     return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13779   case MVT::v32i16:
13780     return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13781   case MVT::v64i8:
13782     return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13783
13784   default:
13785     llvm_unreachable("Not a valid 512-bit x86 vector type!");
13786   }
13787 }
13788
13789 // Lower vXi1 vector shuffles.
13790 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13791 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13792 // vector, shuffle it, and then truncate it back.
      //
      // Each vNi1 type is mapped to a fixed integer vector type with the same
      // element count (see the switch below). Inputs that are all-zeros or all-ones
      // build vectors are materialized directly in the extended type instead of
      // being sign-extended, and an undef V2 stays undef. The shuffled value is
      // converted back to VT with X86ISD::CVT2MASK when the subtarget has BWI (for
      // 32 or more elements) or DQI (for fewer than 32 elements), and with a plain
      // ISD::TRUNCATE otherwise.
13793 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13794                                       MVT VT, SDValue V1, SDValue V2,
13795                                       const X86Subtarget &Subtarget,
13796                                       SelectionDAG &DAG) {
13797   assert(Subtarget.hasAVX512() &&
13798          "Cannot lower 512-bit vectors w/o basic ISA!");
        // Pick the integer vector type the mask is extended to for the shuffle.
13799   MVT ExtVT;
13800   switch (VT.SimpleTy) {
13801   default:
13802     llvm_unreachable("Expected a vector of i1 elements");
13803   case MVT::v2i1:
13804     ExtVT = MVT::v2i64;
13805     break;
13806   case MVT::v4i1:
13807     ExtVT = MVT::v4i32;
13808     break;
13809   case MVT::v8i1:
13810     ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13811     break;
13812   case MVT::v16i1:
13813     ExtVT = MVT::v16i32;
13814     break;
13815   case MVT::v32i1:
13816     ExtVT = MVT::v32i16;
13817     break;
13818   case MVT::v64i1:
13819     ExtVT = MVT::v64i8;
13820     break;
13821   }
13822
        // Extend V1: constant all-zeros / all-ones inputs are rebuilt directly in
        // ExtVT, anything else is sign-extended.
13823   if (ISD::isBuildVectorAllZeros(V1.getNode()))
13824     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13825   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13826     V1 = getOnesVector(ExtVT, DAG, DL);
13827   else
13828     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13829
        // Extend V2 the same way, additionally keeping an undef input undef.
13830   if (V2.isUndef())
13831     V2 = DAG.getUNDEF(ExtVT);
13832   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13833     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13834   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13835     V2 = getOnesVector(ExtVT, DAG, DL);
13836   else
13837     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13838
        // The element count is unchanged, so the original mask applies as-is.
13839   SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13840   // The i1 elements were sign extended, so we can use X86ISD::CVT2MASK.
13841   int NumElems = VT.getVectorNumElements();
13842   if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13843       (Subtarget.hasDQI() && (NumElems < 32)))
13844     return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13845
        // Without the matching feature, fall back to a generic truncate.
13846   return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13847 }
13848
13849 /// Helper function that returns true if the shuffle mask should be
13850 /// commuted to improve canonicalization.
      ///
      /// Mask entries below zero are ignored, entries below Mask.size() count as
      /// V1 references and all other entries count as V2 references. The mask must
      /// reference V1 or V2 at least once: a mask with no non-negative entries
      /// trips the assertion below.
13851 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13852   int NumElements = Mask.size();
13853
        // Count how many mask entries select from each input.
13854   int NumV1Elements = 0, NumV2Elements = 0;
13855   for (int M : Mask)
13856     if (M < 0)
13857       continue;
13858     else if (M < NumElements)
13859       ++NumV1Elements;
13860     else
13861       ++NumV2Elements;
13862
13863   // Commute the shuffle as needed such that more elements come from V1 than
13864   // V2. This allows us to match the shuffle pattern strictly on how many
13865   // elements come from V1 without handling the symmetric cases.
13866   if (NumV2Elements > NumV1Elements)
13867     return true;
13868
        // Here NumV1Elements >= NumV2Elements, so this only fires when both counts
        // are zero.
13869   assert(NumV1Elements > 0 && "No V1 indices");
13870
13871   if (NumV2Elements == 0)
13872     return false;
13873
13874   // When the number of V1 and V2 elements are the same, try to minimize the
13875   // number of uses of V2 in the low half of the vector. When that is tied,
13876   // ensure that the sum of indices for V1 is equal to or lower than the sum
13877   // of indices for V2. When those are equal, try to ensure that the number of
13878   // odd indices for V1 is lower than the number of odd indices for V2.
        // Note that the "indices" summed and counted below are positions within the
        // mask (the loop variable i), not the mask values themselves.
13879   if (NumV1Elements == NumV2Elements) {
          // Tie-breaker 1: uses of each input in the low half of the mask.
13880     int LowV1Elements = 0, LowV2Elements = 0;
13881     for (int M : Mask.slice(0, NumElements / 2))
13882       if (M >= NumElements)
13883         ++LowV2Elements;
13884       else if (M >= 0)
13885         ++LowV1Elements;
13886     if (LowV2Elements > LowV1Elements)
13887       return true;
13888     if (LowV2Elements == LowV1Elements) {
            // Tie-breaker 2: sum of the mask positions occupied by each input.
13889       int SumV1Indices = 0, SumV2Indices = 0;
13890       for (int i = 0, Size = Mask.size(); i < Size; ++i)
13891         if (Mask[i] >= NumElements)
13892           SumV2Indices += i;
13893         else if (Mask[i] >= 0)
13894           SumV1Indices += i;
13895       if (SumV2Indices < SumV1Indices)
13896         return true;
13897       if (SumV2Indices == SumV1Indices) {
              // Tie-breaker 3: number of odd mask positions occupied by each input.
13898         int NumV1OddIndices = 0, NumV2OddIndices = 0;
13899         for (int i = 0, Size = Mask.size(); i < Size; ++i)
13900           if (Mask[i] >= NumElements)
13901             NumV2OddIndices += i % 2;
13902           else if (Mask[i] >= 0)
13903             NumV1OddIndices += i % 2;
13904         if (NumV2OddIndices < NumV1OddIndices)
13905           return true;
13906       }
13907     }
13908   }
13909
13910   return false;
13911 }
13912
13913 /// \brief Top-level lowering for x86 vector shuffles.
13914 ///
13915 /// This handles decomposition, canonicalization, and lowering of all x86
13916 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13917 /// above in helper routines. The canonicalization attempts to widen shuffles
13918 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13919 /// s.t. only one of the two inputs needs to be tested, etc.
      ///
      /// Several of the canonicalization steps below do not lower anything
      /// themselves: they return a new, equivalent shuffle node (commuted, with a
      /// cleaned-up mask, or with widened elements).
      /// NOTE(review): presumably that node is lowered again by the caller -
      /// confirm against the legalizer.
13920 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13921                                   SelectionDAG &DAG) {
13922   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13923   ArrayRef<int> Mask = SVOp->getMask();
13924   SDValue V1 = Op.getOperand(0);
13925   SDValue V2 = Op.getOperand(1);
13926   MVT VT = Op.getSimpleValueType();
13927   int NumElements = VT.getVectorNumElements();
13928   SDLoc DL(Op);
13929   bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13930
        // 64-bit types are only accepted here when they are vXi1 mask vectors.
13931   assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13932          "Can't lower MMX shuffles");
13933
13934   bool V1IsUndef = V1.isUndef();
13935   bool V2IsUndef = V2.isUndef();
13936   if (V1IsUndef && V2IsUndef)
13937     return DAG.getUNDEF(VT);
13938
13939   // When we create a shuffle node we put the UNDEF node to second operand,
13940   // but in some cases the first operand may be transformed to UNDEF.
13941   // In this case we should just commute the node.
13942   if (V1IsUndef)
13943     return DAG.getCommutedVectorShuffle(*SVOp);
13944
13945   // Check for non-undef masks pointing at an undef vector and make the masks
13946   // undef as well. This makes it easier to match the shuffle based solely on
13947   // the mask.
        // The outer loop only detects the first such entry; the whole mask is then
        // rewritten once and the new shuffle is returned.
13948   if (V2IsUndef)
13949     for (int M : Mask)
13950       if (M >= NumElements) {
13951         SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13952         for (int &M : NewMask)
13953           if (M >= NumElements)
13954             M = -1;
13955         return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13956       }
13957
13958   // Check for illegal shuffle mask element index values.
        // The (void) cast keeps the variable "used" when the assert is compiled out.
13959   int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13960   assert(llvm::all_of(Mask,
13961                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13962          "Out of bounds shuffle index");
13963
13964   // We actually see shuffles that are entirely re-arrangements of a set of
13965   // zero inputs. This mostly happens while decomposing complex shuffles into
13966   // simple ones. Directly lower these as a buildvector of zeros.
13967   APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13968   if (Zeroable.isAllOnesValue())
13969     return getZeroVector(VT, Subtarget, DAG, DL);
13970
13971   // Try to collapse shuffles into using a vector type with fewer elements but
13972   // wider element types. We cap this to not form integers or floating point
13973   // elements wider than 64 bits, but it might be interesting to form i128
13974   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
        // vXi1 mask vectors are never widened this way.
13975   SmallVector<int, 16> WidenedMask;
13976   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13977       canWidenShuffleElements(Mask, WidenedMask)) {
13978     MVT NewEltVT = VT.isFloatingPoint()
13979                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13980                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13981     MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13982     // Make sure that the new vector type is legal. For example, v2f64 isn't
13983     // legal on SSE1.
13984     if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13985       V1 = DAG.getBitcast(NewVT, V1);
13986       V2 = DAG.getBitcast(NewVT, V2);
13987       return DAG.getBitcast(
13988           VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13989     }
13990   }
13991
13992   // Commute the shuffle if it will improve canonicalization.
13993   if (canonicalizeShuffleMaskWithCommute(Mask))
13994     return DAG.getCommutedVectorShuffle(*SVOp);
13995
13996   // For each vector width, delegate to a specialized lowering routine.
13997   if (VT.is128BitVector())
13998     return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13999                                     DAG);
14000
14001   if (VT.is256BitVector())
14002     return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14003                                     DAG);
14004
14005   if (VT.is512BitVector())
14006     return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14007                                     DAG);
14008
        // Mask (vXi1) vectors are only dispatched here when VT is not a 128-, 256-
        // or 512-bit type, because the width checks above run first.
14009   if (Is1BitVector)
14010     return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
14011
14012   llvm_unreachable("Unimplemented!");
14013 }
14014
14015 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
      ///
      /// Only handles a condition operand that is a build vector of constants; for
      /// any other condition an empty SDValue is returned. The result is a generic
      /// shuffle of the two data operands whose mask picks, per lane, the LHS lane
      /// for a non-null condition constant and the RHS lane for a null one.
      /// Subtarget is not used in the body.
14016 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14017                                            const X86Subtarget &Subtarget,
14018                                            SelectionDAG &DAG) {
14019   SDValue Cond = Op.getOperand(0);
14020   SDValue LHS = Op.getOperand(1);
14021   SDValue RHS = Op.getOperand(2);
14022   SDLoc dl(Op);
14023   MVT VT = Op.getSimpleValueType();
14024
14025   if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14026     return SDValue();
14027   auto *CondBV = cast<BuildVectorSDNode>(Cond);
14028
14029   // Only non-legal VSELECTs reach this lowering, convert those into generic
14030   // shuffles and re-use the shuffle lowering path for blends.
        // Lane i maps to index i (LHS) or i + Size (RHS). A condition element that
        // is not a ConstantSDNode (presumably undef - confirm) becomes an undef
        // mask entry (-1).
14031   SmallVector<int, 32> Mask;
14032   for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14033     SDValue CondElt = CondBV->getOperand(i);
14034     Mask.push_back(
14035         isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14036                                      : -1);
14037   }
14038   return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14039 }
14040
14041 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering for VSELECT. Returns Op itself when the node can be kept
        // as-is, a replacement node when it is rewritten, and an empty SDValue when
        // no custom lowering applies.
        //
14042   // A vselect where all conditions and data are constants can be optimized into
14043   // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14044   if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14045       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14046       ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14047     return SDValue();
14048
14049   // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14050   // with patterns on the mask registers on AVX-512.
14051   if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14052     return Op;
14053
14054   // Try to lower this to a blend-style vector shuffle. This can handle all
14055   // constant condition cases.
14056   if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14057     return BlendOp;
14058
14059   // Variable blends are only legal from SSE4.1 onward.
14060   if (!Subtarget.hasSSE41())
14061     return SDValue();
14062
14063   SDLoc dl(Op);
14064   MVT VT = Op.getSimpleValueType();
14065
14066   // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14067   // into an i1 condition so that we can use the mask-based 512-bit blend
14068   // instructions.
14069   if (VT.getSizeInBits() == 512) {
14070     SDValue Cond = Op.getOperand(0);
14071     // The vNi1 condition case should be handled above as it can be trivially
14072     // lowered.
14073     assert(Cond.getValueType().getScalarSizeInBits() ==
14074                VT.getScalarSizeInBits() &&
14075            "Should have a size-matched integer condition!");
14076     // Build a mask by testing the condition against itself (tests for zero).
          // NOTE(review): X86ISD::TESTM presumably corresponds to VPTESTM, which sets
          // a mask bit where (Cond & Cond) is non-zero; if so, "tests for zero" above
          // reads inverted - confirm.
14077     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14078     SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14079     // Now return a new VSELECT using the mask.
14080     return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14081   }
14082
14083   // Only some types will be legal on some subtargets. If we can emit a legal
14084   // VSELECT-matching blend, return Op, but if we need to expand, return
14085   // a null value.
14086   switch (VT.SimpleTy) {
14087   default:
14088     // Most of the vector types have blends past SSE4.1.
14089     return Op;
14090
14091   case MVT::v32i8:
14092     // The byte blends for AVX vectors were introduced only in AVX2.
14093     if (Subtarget.hasAVX2())
14094       return Op;
14095
14096     return SDValue();
14097
14098   case MVT::v8i16:
14099   case MVT::v16i16:
14100     // AVX-512 BWI and VLX features support VSELECT with i16 elements.
14101     if (Subtarget.hasBWI() && Subtarget.hasVLX())
14102       return Op;
14103
14104     // FIXME: We should custom lower this by fixing the condition and using i8
14105     // blends.
14106     return SDValue();
14107   }
14108 }
14109
14110 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
        // Lowering of EXTRACT_VECTOR_ELT from a 128-bit vector; the only caller
        // visible in this part of the file invokes it when SSE4.1 is available.
        // Returns an empty SDValue when none of the cases below apply, and Op itself
        // when the node is kept unchanged.
14111   MVT VT = Op.getSimpleValueType();
14112   SDLoc dl(Op);
14113
14114   if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14115     return SDValue();
14116
        // 8-bit result: extract as a 32-bit value, record that the upper bits are
        // zero via AssertZext, and truncate to the requested type.
14117   if (VT.getSizeInBits() == 8) {
14118     SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14119                                   Op.getOperand(0), Op.getOperand(1));
14120     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14121                                   DAG.getValueType(VT));
14122     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14123   }
14124
14125   if (VT == MVT::f32) {
14126     // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14127     // the result back to FR32 register. It's only worth matching if the
14128     // result has a single use which is a store or a bitcast to i32.  And in
14129     // the case of a store, it's not worth it if the index is a constant 0,
14130     // because a MOVSSmr can be used instead, which is smaller and faster.
14131     if (!Op.hasOneUse())
14132       return SDValue();
14133     SDNode *User = *Op.getNode()->use_begin();
          // Bail out unless the single user is a store with a non-zero index or a
          // bitcast to i32.
14134     if ((User->getOpcode() != ISD::STORE ||
14135          isNullConstant(Op.getOperand(1))) &&
14136         (User->getOpcode() != ISD::BITCAST ||
14137          User->getValueType(0) != MVT::i32))
14138       return SDValue();
          // Perform the extraction in the integer domain and bitcast back to f32.
14139     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14140                                   DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14141                                   Op.getOperand(1));
14142     return DAG.getBitcast(MVT::f32, Extract);
14143   }
14144
14145   if (VT == MVT::i32 || VT == MVT::i64) {
14146     // ExtractPS/pextrq works with constant index.
14147     if (isa<ConstantSDNode>(Op.getOperand(1)))
14148       return Op;
14149   }
14150
14151   return SDValue();
14152 }
14153
14154 /// Extract one bit from mask vector, like v16i1 or v8i1.
14155 /// AVX-512 feature.
      ///
      /// With a variable index the mask is sign-extended to a regular integer
      /// vector, the element is extracted there and truncated back. With a constant
      /// index the selected bit is moved to position 0 of the mask with a
      /// KSHIFTL / KSHIFTR pair and then extracted.
14156 SDValue
14157 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
14158   SDValue Vec = Op.getOperand(0);
14159   SDLoc dl(Vec);
14160   MVT VecVT = Vec.getSimpleValueType();
14161   SDValue Idx = Op.getOperand(1);
14162   MVT EltVT = Op.getSimpleValueType();
14163
        // Masks with more than 16 elements are only expected with BWI.
14164   assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14165          "Unexpected vector type in ExtractBitFromMaskVector");
14166
14167   // variable index can't be handled in mask registers,
14168   // extend vector to VR512/128
14169   if (!isa<ConstantSDNode>(Idx)) {
14170     unsigned NumElts = VecVT.getVectorNumElements();
14171     // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14172     // than extending to 128/256bit.
          // Up to 4 elements use a 128-bit vector, everything else a 512-bit one;
          // the element width is the vector size divided by the element count.
14173     unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14174     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14175     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14176     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14177                               ExtVT.getVectorElementType(), Ext, Idx);
14178     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14179   }
14180
14181   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
        // Masks with fewer than 8 elements, and 8-element masks without DQI, are
        // first widened to v16i1 by inserting them at position 0 of an undef vector.
14182   if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14183       (VecVT.getVectorNumElements() < 8)) {
14184     // Use kshiftlw/rw instruction.
14185     VecVT = MVT::v16i1;
14186     Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14187                       DAG.getUNDEF(VecVT),
14188                       Vec,
14189                       DAG.getIntPtrConstant(0, dl));
14190   }
        // Shift the wanted bit up to the top position (skipped when it is already
        // there), then shift it all the way down to position 0.
        // NOTE(review): this relies on the KSHIFT nodes shifting in zeros, which
        // also discards the undef upper lanes introduced above - confirm.
14191   unsigned MaxSift = VecVT.getVectorNumElements() - 1;
14192   if (MaxSift - IdxVal)
14193     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14194                       DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14195   Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14196                     DAG.getConstant(MaxSift, dl, MVT::i8));
        // Read the bit that now sits at element 0.
14197   return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14198                      DAG.getIntPtrConstant(0, dl));
14199 }
14200
14201 SDValue
14202 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14203                                            SelectionDAG &DAG) const {
        // Custom lowering for EXTRACT_VECTOR_ELT. Mask (vXi1) vectors are handed to
        // ExtractBitFromMaskVector, variable indices are left to the default
        // lowering (empty SDValue), 256/512-bit vectors are reduced to a 128-bit
        // chunk, and 128-bit vectors are handled per element size below.
14204   SDLoc dl(Op);
14205   SDValue Vec = Op.getOperand(0);
14206   MVT VecVT = Vec.getSimpleValueType();
14207   SDValue Idx = Op.getOperand(1);
14208
14209   if (VecVT.getVectorElementType() == MVT::i1)
14210     return ExtractBitFromMaskVector(Op, DAG);
14211
14212   if (!isa<ConstantSDNode>(Idx)) {
14213     // It's more profitable to go through memory (1 cycles throughput)
14214     // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
14215     // IACA tool was used to get performance estimation
14216     // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14217     //
14218     // example : extractelement <16 x i8> %a, i32 %i
14219     //
14220     // Block Throughput: 3.00 Cycles
14221     // Throughput Bottleneck: Port5
14222     //
14223     // | Num Of |   Ports pressure in cycles  |    |
14224     // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
14225     // ---------------------------------------------
14226     // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
14227     // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
14228     // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
14229     // Total Num Of Uops: 4
14230     //
14231     //
14232     // Block Throughput: 1.00 Cycles
14233     // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14234     //
14235     // |    |  Ports pressure in cycles   |  |
14236     // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
14237     // ---------------------------------------------------------
14238     // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14239     // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
14240     // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
14241     // Total Num Of Uops: 4
14242
14243     return SDValue();
14244   }
14245
14246   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14247
14248   // If this is a 256-bit or 512-bit vector, first extract the 128-bit chunk
14249   // that holds the element and then extract the element from that chunk.
14250   if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14251     // Get the 128-bit vector.
14252     Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14253     MVT EltVT = VecVT.getVectorElementType();
14254
14255     unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14256     assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14257
14258     // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14259     // this can be done with a mask.
14260     IdxVal &= ElemsPerChunk - 1;
14261     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14262                        DAG.getConstant(IdxVal, dl, MVT::i32));
14263   }
14264
14265   assert(VecVT.is128BitVector() && "Unexpected vector length");
14266
14267   MVT VT = Op.getSimpleValueType();
14268
14269   if (VT.getSizeInBits() == 16) {
14270     // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14271     // we're going to zero extend the register or fold the store (SSE41 only).
14272     if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14273         !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14274       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14275                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14276                                      DAG.getBitcast(MVT::v4i32, Vec), Idx));
14277
14278     // Transform it so it matches pextrw, which produces a 32-bit result.
14279     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14280                                   Op.getOperand(0), Op.getOperand(1));
14281     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14282                                   DAG.getValueType(VT));
14283     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14284   }
14285
        // Give the SSE4.1-specific lowering a chance; fall through if it declines.
14286   if (Subtarget.hasSSE41())
14287     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14288       return Res;
14289
14290   // TODO: We only extract a single element from v16i8, we can probably afford
14291   // to be more aggressive here before using the default approach of spilling to
14292   // stack.
14293   if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14294     // Extract either the lowest i32 or any i16, and extract the sub-byte.
14295     int DWordIdx = IdxVal / 4;
          // Bytes 0-3 live in the lowest dword: extract it and shift the wanted byte
          // down to the low 8 bits before truncating.
14296     if (DWordIdx == 0) {
14297       SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14298                                 DAG.getBitcast(MVT::v4i32, Vec),
14299                                 DAG.getIntPtrConstant(DWordIdx, dl));
14300       int ShiftVal = (IdxVal % 4) * 8;
14301       if (ShiftVal != 0)
14302         Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14303                           DAG.getConstant(ShiftVal, dl, MVT::i32));
14304       return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14305     }
14306
          // Any other byte: extract the containing 16-bit word and shift the high
          // byte down when the byte index is odd.
14307     int WordIdx = IdxVal / 2;
14308     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14309                               DAG.getBitcast(MVT::v8i16, Vec),
14310                               DAG.getIntPtrConstant(WordIdx, dl));
14311     int ShiftVal = (IdxVal % 2) * 8;
14312     if (ShiftVal != 0)
14313       Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14314                         DAG.getConstant(ShiftVal, dl, MVT::i16));
14315     return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14316   }
14317
14318   if (VT.getSizeInBits() == 32) {
          // Element 0 needs no rearrangement; keep the node as it is.
14319     if (IdxVal == 0)
14320       return Op;
14321
14322     // SHUFPS the element to the lowest double word, then movss.
14323     int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14324     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14325     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14326                        DAG.getIntPtrConstant(0, dl));
14327   }
14328
14329   if (VT.getSizeInBits() == 64) {
14330     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14331     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14332     //        to match extract_elt for f64.
14333     if (IdxVal == 0)
14334       return Op;
14335
14336     // UNPCKHPD the element to the lowest double word, then movsd.
14337     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14338     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14339     int Mask[2] = { 1, -1 };
14340     Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14341     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14342                        DAG.getIntPtrConstant(0, dl));
14343   }
14344
14345   return SDValue();
14346 }
14347
14348 /// Insert one bit to mask vector, like v16i1 or v8i1.
14349 /// AVX-512 feature.
      ///
      /// With a variable index the mask and the new element are zero-extended to a
      /// regular integer vector, the insertion is done there and the result is
      /// truncated back. With a constant index the element is first placed into a
      /// mask vector of its own and then merged into the source: by shifts and an
      /// OR for the first and last position, and by a shuffle for any other one.
14350 SDValue
14351 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14352   SDLoc dl(Op);
14353   SDValue Vec = Op.getOperand(0);
14354   SDValue Elt = Op.getOperand(1);
14355   SDValue Idx = Op.getOperand(2);
14356   MVT VecVT = Vec.getSimpleValueType();
14357
14358   if (!isa<ConstantSDNode>(Idx)) {
14359     // Non constant index. Extend source and destination,
14360     // insert element and then truncate the result.
          // NOTE(review): only v8i1 is distinguished here; every other mask type is
          // extended to v16i32, whose element count matches v16i1 only. Confirm
          // that no other vXi1 type reaches this path with a variable index.
14361     MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
14362     MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
14363     SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14364       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14365       DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14366     return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14367   }
14368
14369   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
        // Place the new bit into a mask vector of its own via SCALAR_TO_VECTOR.
14370   SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14371   unsigned NumElems = VecVT.getVectorNumElements();
14372
        // Inserting into an undef vector: nothing has to be preserved, so the
        // element vector, shifted to the requested position, is the result.
14373   if(Vec.isUndef()) {
14374     if (IdxVal)
14375       EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14376                              DAG.getConstant(IdxVal, dl, MVT::i8));
14377     return EltInVec;
14378   }
14379
14380   // Insertion of one bit into first position
14381   if (IdxVal == 0 ) {
14382     // Clean top bits of vector.
          // (shift the bit up to the last position and back down to position 0)
14383     EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14384                            DAG.getConstant(NumElems - 1, dl, MVT::i8));
14385     EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14386                            DAG.getConstant(NumElems - 1, dl, MVT::i8));
14387     // Clean the first bit in source vector.
14388     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14389                       DAG.getConstant(1 , dl, MVT::i8));
14390     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14391                       DAG.getConstant(1, dl, MVT::i8));
14392
14393     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14394   }
14395   // Insertion of one bit into last position
14396   if (IdxVal == NumElems -1) {
14397     // Move the bit to the last position inside the vector.
14398     EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14399                            DAG.getConstant(IdxVal, dl, MVT::i8));
14400     // Clean the last bit in the source vector.
14401     Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14402                            DAG.getConstant(1, dl, MVT::i8));
14403     Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14404                            DAG.getConstant(1 , dl, MVT::i8));
14405
14406     return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14407   }
14408
14409   // Use shuffle to insert element.
        // Every lane keeps its own element of Vec except lane IdxVal, which takes
        // element 0 of EltInVec (shuffle index NumElems).
14410   SmallVector<int, 64> MaskVec(NumElems);
14411   for (unsigned i = 0; i != NumElems; ++i)
14412     MaskVec[i] = (i == IdxVal) ? NumElems : i;
14413
14414   return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14415 }
14416
14417 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14418                                                   SelectionDAG &DAG) const {
14419   MVT VT = Op.getSimpleValueType();
14420   MVT EltVT = VT.getVectorElementType();
14421   unsigned NumElts = VT.getVectorNumElements();
14422
14423   if (EltVT == MVT::i1)
14424     return InsertBitToMaskVector(Op, DAG);
14425
14426   SDLoc dl(Op);
14427   SDValue N0 = Op.getOperand(0);
14428   SDValue N1 = Op.getOperand(1);
14429   SDValue N2 = Op.getOperand(2);
14430   if (!isa<ConstantSDNode>(N2))
14431     return SDValue();
14432   auto *N2C = cast<ConstantSDNode>(N2);
14433   unsigned IdxVal = N2C->getZExtValue();
14434
14435   bool IsZeroElt = X86::isZeroNode(N1);
14436   bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14437
14438   // If we are inserting a element, see if we can do this more efficiently with
14439   // a blend shuffle with a rematerializable vector than a costly integer
14440   // insertion.
14441   if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14442       16 <= EltVT.getSizeInBits()) {
14443     SmallVector<int, 8> BlendMask;
14444     for (unsigned i = 0; i != NumElts; ++i)
14445       BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14446     SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14447                                   : DAG.getConstant(-1, dl, VT);
14448     return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14449   }
14450
14451   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14452   // into that, and then insert the subvector back into the result.
14453   if (VT.is256BitVector() || VT.is512BitVector()) {
14454     // With a 256-bit vector, we can insert into the zero element efficiently
14455     // using a blend if we have AVX or AVX2 and the right data type.
14456     if (VT.is256BitVector() && IdxVal == 0) {
14457       // TODO: It is worthwhile to cast integer to floating point and back
14458       // and incur a domain crossing penalty if that's what we'll end up
14459       // doing anyway after extracting to a 128-bit vector.
14460       if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14461           (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14462         SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14463         N2 = DAG.getIntPtrConstant(1, dl);
14464         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14465       }
14466     }
14467
14468     // Get the desired 128-bit vector chunk.
14469     SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14470
14471     // Insert the element into the desired chunk.
14472     unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14473     assert(isPowerOf2_32(NumEltsIn128));
14474     // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14475     unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14476
14477     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14478                     DAG.getConstant(IdxIn128, dl, MVT::i32));
14479
14480     // Insert the changed part back into the bigger vector
14481     return insert128BitVector(N0, V, IdxVal, DAG, dl);
14482   }
14483   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14484
14485   // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14486   // argument. SSE41 required for pinsrb.
14487   if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14488     unsigned Opc;
14489     if (VT == MVT::v8i16) {
14490       assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14491       Opc = X86ISD::PINSRW;
14492     } else {
14493       assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14494       assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14495       Opc = X86ISD::PINSRB;
14496     }
14497
14498     if (N1.getValueType() != MVT::i32)
14499       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14500     if (N2.getValueType() != MVT::i32)
14501       N2 = DAG.getIntPtrConstant(IdxVal, dl);
14502     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14503   }
14504
14505   if (Subtarget.hasSSE41()) {
14506     if (EltVT == MVT::f32) {
14507       // Bits [7:6] of the constant are the source select. This will always be
14508       //   zero here. The DAG Combiner may combine an extract_elt index into
14509       //   these bits. For example (insert (extract, 3), 2) could be matched by
14510       //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14511       // Bits [5:4] of the constant are the destination select. This is the
14512       //   value of the incoming immediate.
14513       // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14514       //   combine either bitwise AND or insert of float 0.0 to set these bits.
14515
14516       bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14517       if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14518         // If this is an insertion of 32-bits into the low 32-bits of
14519         // a vector, we prefer to generate a blend with immediate rather
14520         // than an insertps. Blends are simpler operations in hardware and so
14521         // will always have equal or better performance than insertps.
14522         // But if optimizing for size and there's a load folding opportunity,
14523         // generate insertps because blendps does not have a 32-bit memory
14524         // operand form.
14525         N2 = DAG.getIntPtrConstant(1, dl);
14526         N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14527         return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14528       }
14529       N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14530       // Create this as a scalar to vector..
14531       N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14532       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14533     }
14534
14535     // PINSR* works with constant index.
14536     if (EltVT == MVT::i32 || EltVT == MVT::i64)
14537       return Op;
14538   }
14539
14540   return SDValue();
14541 }
14542
14543 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14544                                      SelectionDAG &DAG) {
14545   SDLoc dl(Op);
14546   MVT OpVT = Op.getSimpleValueType();
14547
14548   // It's always cheaper to replace a xor+movd with xorps and simplifies further
14549   // combines.
14550   if (X86::isZeroNode(Op.getOperand(0)))
14551     return getZeroVector(OpVT, Subtarget, DAG, dl);
14552
14553   // If this is a 256-bit vector result, first insert into a 128-bit
14554   // vector and then insert into the 256-bit vector.
14555   if (!OpVT.is128BitVector()) {
14556     // Insert into a 128-bit vector.
14557     unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14558     MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14559                                  OpVT.getVectorNumElements() / SizeFactor);
14560
14561     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14562
14563     // Insert the 128-bit vector.
14564     return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14565   }
14566   assert(OpVT.is128BitVector() && "Expected an SSE type!");
14567
14568   // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14569   if (OpVT == MVT::v4i32)
14570     return Op;
14571
14572   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14573   return DAG.getBitcast(
14574       OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14575 }
14576
14577 // Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
14578 // a simple subregister reference or explicit instructions to grab
14579 // upper bits of a vector.
14580 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14581                                       SelectionDAG &DAG) {
14582   assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14583
14584   SDLoc dl(Op);
14585   SDValue In =  Op.getOperand(0);
14586   SDValue Idx = Op.getOperand(1);
14587   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14588   MVT ResVT = Op.getSimpleValueType();
14589
14590   // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond
14591   // would result with: v1i1 = extract_subvector(vXi1, idx).
14592   // Lower these into extract_vector_elt which is already selectable.
14593   if (ResVT == MVT::v1i1) {
14594     assert(Subtarget.hasAVX512() &&
14595            "Boolean EXTRACT_SUBVECTOR requires AVX512");
14596
14597     MVT EltVT = ResVT.getVectorElementType();
14598     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14599     MVT LegalVT =
14600         (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
14601     SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
14602     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
14603   }
14604
14605   assert((In.getSimpleValueType().is256BitVector() ||
14606           In.getSimpleValueType().is512BitVector()) &&
14607          "Can only extract from 256-bit or 512-bit vectors");
14608
14609   // If the input is a buildvector just emit a smaller one.
14610   unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14611   if (In.getOpcode() == ISD::BUILD_VECTOR)
14612     return DAG.getBuildVector(
14613         ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
14614
14615   // Everything else is legal.
14616   return Op;
14617 }
14618
14619 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
14620 // simple superregister reference or explicit instructions to insert
14621 // the upper bits of a vector.
14622 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14623                                      SelectionDAG &DAG) {
14624   assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14625
14626   return insert1BitVector(Op, DAG, Subtarget);
14627 }
14628
14629 // Returns the appropriate wrapper opcode for a global reference.
14630 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14631   // References to absolute symbols are never PC-relative.
14632   if (GV && GV->isAbsoluteSymbolRef())
14633     return X86ISD::Wrapper;
14634
14635   CodeModel::Model M = getTargetMachine().getCodeModel();
14636   if (Subtarget.isPICStyleRIPRel() &&
14637       (M == CodeModel::Small || M == CodeModel::Kernel))
14638     return X86ISD::WrapperRIP;
14639
14640   return X86ISD::Wrapper;
14641 }
14642
14643 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14644 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14645 // one of the above mentioned nodes. It has to be wrapped because otherwise
14646 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14647 // be used to form addressing mode. These wrapped nodes will be selected
14648 // into MOV32ri.
14649 SDValue
14650 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14651   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14652
14653   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14654   // global base reg.
14655   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14656
14657   auto PtrVT = getPointerTy(DAG.getDataLayout());
14658   SDValue Result = DAG.getTargetConstantPool(
14659       CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14660   SDLoc DL(CP);
14661   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14662   // With PIC, the address is actually $g + Offset.
14663   if (OpFlag) {
14664     Result =
14665         DAG.getNode(ISD::ADD, DL, PtrVT,
14666                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14667   }
14668
14669   return Result;
14670 }
14671
14672 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14673   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14674
14675   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14676   // global base reg.
14677   unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14678
14679   auto PtrVT = getPointerTy(DAG.getDataLayout());
14680   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14681   SDLoc DL(JT);
14682   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14683
14684   // With PIC, the address is actually $g + Offset.
14685   if (OpFlag)
14686     Result =
14687         DAG.getNode(ISD::ADD, DL, PtrVT,
14688                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14689
14690   return Result;
14691 }
14692
14693 SDValue
14694 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14695   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14696
14697   // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14698   // global base reg.
14699   const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14700   unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14701
14702   auto PtrVT = getPointerTy(DAG.getDataLayout());
14703   SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14704
14705   SDLoc DL(Op);
14706   Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14707
14708   // With PIC, the address is actually $g + Offset.
14709   if (isPositionIndependent() && !Subtarget.is64Bit()) {
14710     Result =
14711         DAG.getNode(ISD::ADD, DL, PtrVT,
14712                     DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14713   }
14714
14715   // For symbols that require a load from a stub to get the address, emit the
14716   // load.
14717   if (isGlobalStubReference(OpFlag))
14718     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14719                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14720
14721   return Result;
14722 }
14723
14724 SDValue
14725 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14726   // Create the TargetBlockAddressAddress node.
14727   unsigned char OpFlags =
14728     Subtarget.classifyBlockAddressReference();
14729   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14730   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14731   SDLoc dl(Op);
14732   auto PtrVT = getPointerTy(DAG.getDataLayout());
14733   SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14734   Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14735
14736   // With PIC, the address is actually $g + Offset.
14737   if (isGlobalRelativeToPICBase(OpFlags)) {
14738     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14739                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14740   }
14741
14742   return Result;
14743 }
14744
14745 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14746                                               const SDLoc &dl, int64_t Offset,
14747                                               SelectionDAG &DAG) const {
14748   // Create the TargetGlobalAddress node, folding in the constant
14749   // offset if it is legal.
14750   unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14751   CodeModel::Model M = DAG.getTarget().getCodeModel();
14752   auto PtrVT = getPointerTy(DAG.getDataLayout());
14753   SDValue Result;
14754   if (OpFlags == X86II::MO_NO_FLAG &&
14755       X86::isOffsetSuitableForCodeModel(Offset, M)) {
14756     // A direct static reference to a global.
14757     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14758     Offset = 0;
14759   } else {
14760     Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14761   }
14762
14763   Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14764
14765   // With PIC, the address is actually $g + Offset.
14766   if (isGlobalRelativeToPICBase(OpFlags)) {
14767     Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14768                          DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14769   }
14770
14771   // For globals that require a load from a stub to get the address, emit the
14772   // load.
14773   if (isGlobalStubReference(OpFlags))
14774     Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14775                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14776
14777   // If there was a non-zero offset that we didn't fold, create an explicit
14778   // addition for it.
14779   if (Offset != 0)
14780     Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14781                          DAG.getConstant(Offset, dl, PtrVT));
14782
14783   return Result;
14784 }
14785
14786 SDValue
14787 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14788   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14789   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14790   return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
14791 }
14792
14793 static SDValue
14794 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14795            SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14796            unsigned char OperandFlags, bool LocalDynamic = false) {
14797   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14798   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14799   SDLoc dl(GA);
14800   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14801                                            GA->getValueType(0),
14802                                            GA->getOffset(),
14803                                            OperandFlags);
14804
14805   X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14806                                            : X86ISD::TLSADDR;
14807
14808   if (InFlag) {
14809     SDValue Ops[] = { Chain,  TGA, *InFlag };
14810     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14811   } else {
14812     SDValue Ops[]  = { Chain, TGA };
14813     Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14814   }
14815
14816   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14817   MFI.setAdjustsStack(true);
14818   MFI.setHasCalls(true);
14819
14820   SDValue Flag = Chain.getValue(1);
14821   return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14822 }
14823
14824 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14825 static SDValue
14826 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14827                                 const EVT PtrVT) {
14828   SDValue InFlag;
14829   SDLoc dl(GA);  // ? function entry point might be better
14830   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14831                                    DAG.getNode(X86ISD::GlobalBaseReg,
14832                                                SDLoc(), PtrVT), InFlag);
14833   InFlag = Chain.getValue(1);
14834
14835   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14836 }
14837
14838 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14839 static SDValue
14840 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14841                                 const EVT PtrVT) {
14842   return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14843                     X86::RAX, X86II::MO_TLSGD);
14844 }
14845
14846 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14847                                            SelectionDAG &DAG,
14848                                            const EVT PtrVT,
14849                                            bool is64Bit) {
14850   SDLoc dl(GA);
14851
14852   // Get the start address of the TLS block for this module.
14853   X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14854       .getInfo<X86MachineFunctionInfo>();
14855   MFI->incNumLocalDynamicTLSAccesses();
14856
14857   SDValue Base;
14858   if (is64Bit) {
14859     Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14860                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
14861   } else {
14862     SDValue InFlag;
14863     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14864         DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14865     InFlag = Chain.getValue(1);
14866     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14867                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14868   }
14869
14870   // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
14871   // of Base.
14872
14873   // Build x@dtpoff.
14874   unsigned char OperandFlags = X86II::MO_DTPOFF;
14875   unsigned WrapperKind = X86ISD::Wrapper;
14876   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14877                                            GA->getValueType(0),
14878                                            GA->getOffset(), OperandFlags);
14879   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14880
14881   // Add x@dtpoff with the base.
14882   return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14883 }
14884
14885 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14886 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14887                                    const EVT PtrVT, TLSModel::Model model,
14888                                    bool is64Bit, bool isPIC) {
14889   SDLoc dl(GA);
14890
14891   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14892   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14893                                                          is64Bit ? 257 : 256));
14894
14895   SDValue ThreadPointer =
14896       DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14897                   MachinePointerInfo(Ptr));
14898
14899   unsigned char OperandFlags = 0;
14900   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
14901   // initialexec.
14902   unsigned WrapperKind = X86ISD::Wrapper;
14903   if (model == TLSModel::LocalExec) {
14904     OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14905   } else if (model == TLSModel::InitialExec) {
14906     if (is64Bit) {
14907       OperandFlags = X86II::MO_GOTTPOFF;
14908       WrapperKind = X86ISD::WrapperRIP;
14909     } else {
14910       OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14911     }
14912   } else {
14913     llvm_unreachable("Unexpected model");
14914   }
14915
14916   // emit "addl x@ntpoff,%eax" (local exec)
14917   // or "addl x@indntpoff,%eax" (initial exec)
14918   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14919   SDValue TGA =
14920       DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14921                                  GA->getOffset(), OperandFlags);
14922   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14923
14924   if (model == TLSModel::InitialExec) {
14925     if (isPIC && !is64Bit) {
14926       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14927                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14928                            Offset);
14929     }
14930
14931     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14932                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14933   }
14934
14935   // The address of the thread local variable is the add of the thread
14936   // pointer with the offset of the variable.
14937   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14938 }
14939
14940 SDValue
14941 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14942
14943   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14944
14945   if (DAG.getTarget().Options.EmulatedTLS)
14946     return LowerToTLSEmulatedModel(GA, DAG);
14947
14948   const GlobalValue *GV = GA->getGlobal();
14949   auto PtrVT = getPointerTy(DAG.getDataLayout());
14950   bool PositionIndependent = isPositionIndependent();
14951
14952   if (Subtarget.isTargetELF()) {
14953     TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14954     switch (model) {
14955       case TLSModel::GeneralDynamic:
14956         if (Subtarget.is64Bit())
14957           return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14958         return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14959       case TLSModel::LocalDynamic:
14960         return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14961                                            Subtarget.is64Bit());
14962       case TLSModel::InitialExec:
14963       case TLSModel::LocalExec:
14964         return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14965                                    PositionIndependent);
14966     }
14967     llvm_unreachable("Unknown TLS model.");
14968   }
14969
14970   if (Subtarget.isTargetDarwin()) {
14971     // Darwin only has one model of TLS.  Lower to that.
14972     unsigned char OpFlag = 0;
14973     unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14974                            X86ISD::WrapperRIP : X86ISD::Wrapper;
14975
14976     // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14977     // global base reg.
14978     bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14979     if (PIC32)
14980       OpFlag = X86II::MO_TLVP_PIC_BASE;
14981     else
14982       OpFlag = X86II::MO_TLVP;
14983     SDLoc DL(Op);
14984     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14985                                                 GA->getValueType(0),
14986                                                 GA->getOffset(), OpFlag);
14987     SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14988
14989     // With PIC32, the address is actually $g + Offset.
14990     if (PIC32)
14991       Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14992                            DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14993                            Offset);
14994
14995     // Lowering the machine isd will make sure everything is in the right
14996     // location.
14997     SDValue Chain = DAG.getEntryNode();
14998     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14999     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15000     SDValue Args[] = { Chain, Offset };
15001     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15002     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15003                                DAG.getIntPtrConstant(0, DL, true),
15004                                Chain.getValue(1), DL);
15005
15006     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
15007     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15008     MFI.setAdjustsStack(true);
15009
15010     // And our return value (tls address) is in the standard call return value
15011     // location.
15012     unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15013     return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15014   }
15015
15016   if (Subtarget.isTargetKnownWindowsMSVC() ||
15017       Subtarget.isTargetWindowsItanium() ||
15018       Subtarget.isTargetWindowsGNU()) {
15019     // Just use the implicit TLS architecture
15020     // Need to generate something similar to:
15021     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15022     //                                  ; from TEB
15023     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
15024     //   mov     rcx, qword [rdx+rcx*8]
15025     //   mov     eax, .tls$:tlsvar
15026     //   [rax+rcx] contains the address
15027     // Windows 64bit: gs:0x58
15028     // Windows 32bit: fs:__tls_array
15029
15030     SDLoc dl(GA);
15031     SDValue Chain = DAG.getEntryNode();
15032
15033     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15034     // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15035     // use its literal value of 0x2C.
15036     Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15037                                         ? Type::getInt8PtrTy(*DAG.getContext(),
15038                                                              256)
15039                                         : Type::getInt32PtrTy(*DAG.getContext(),
15040                                                               257));
15041
15042     SDValue TlsArray = Subtarget.is64Bit()
15043                            ? DAG.getIntPtrConstant(0x58, dl)
15044                            : (Subtarget.isTargetWindowsGNU()
15045                                   ? DAG.getIntPtrConstant(0x2C, dl)
15046                                   : DAG.getExternalSymbol("_tls_array", PtrVT));
15047
15048     SDValue ThreadPointer =
15049         DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15050
15051     SDValue res;
15052     if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15053       res = ThreadPointer;
15054     } else {
15055       // Load the _tls_index variable
15056       SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15057       if (Subtarget.is64Bit())
15058         IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15059                              MachinePointerInfo(), MVT::i32);
15060       else
15061         IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15062
15063       auto &DL = DAG.getDataLayout();
15064       SDValue Scale =
15065           DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15066       IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15067
15068       res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15069     }
15070
15071     res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15072
15073     // Get the offset of start of .tls section
15074     SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15075                                              GA->getValueType(0),
15076                                              GA->getOffset(), X86II::MO_SECREL);
15077     SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15078
15079     // The address of the thread local variable is the add of the thread
15080     // pointer with the offset of the variable.
15081     return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15082   }
15083
15084   llvm_unreachable("TLS not implemented for this target.");
15085 }
15086
15087 /// Lower SRA_PARTS and friends, which return two i32 values
15088 /// and take a 2 x i32 value to shift plus a shift amount.
15089 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15090   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15091   MVT VT = Op.getSimpleValueType();
15092   unsigned VTBits = VT.getSizeInBits();
15093   SDLoc dl(Op);
15094   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15095   SDValue ShOpLo = Op.getOperand(0);
15096   SDValue ShOpHi = Op.getOperand(1);
15097   SDValue ShAmt  = Op.getOperand(2);
15098   // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15099   // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
15100   // during isel.
15101   SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15102                                   DAG.getConstant(VTBits - 1, dl, MVT::i8));
15103   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15104                                      DAG.getConstant(VTBits - 1, dl, MVT::i8))
15105                        : DAG.getConstant(0, dl, VT);
15106
15107   SDValue Tmp2, Tmp3;
15108   if (Op.getOpcode() == ISD::SHL_PARTS) {
15109     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15110     Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15111   } else {
15112     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15113     Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15114   }
15115
15116   // If the shift amount is larger or equal than the width of a part we can't
15117   // rely on the results of shld/shrd. Insert a test and select the appropriate
15118   // values for large shift amounts.
15119   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15120                                 DAG.getConstant(VTBits, dl, MVT::i8));
15121   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15122                              AndNode, DAG.getConstant(0, dl, MVT::i8));
15123
15124   SDValue Hi, Lo;
15125   SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15126   SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15127   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15128
15129   if (Op.getOpcode() == ISD::SHL_PARTS) {
15130     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15131     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15132   } else {
15133     Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15134     Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15135   }
15136
15137   SDValue Ops[2] = { Lo, Hi };
15138   return DAG.getMergeValues(Ops, dl);
15139 }
15140
15141 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15142                                            SelectionDAG &DAG) const {
15143   SDValue Src = Op.getOperand(0);
15144   MVT SrcVT = Src.getSimpleValueType();
15145   MVT VT = Op.getSimpleValueType();
15146   SDLoc dl(Op);
15147
15148   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15149   if (SrcVT.isVector()) {
15150     if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15151       return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15152                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15153                                      DAG.getUNDEF(SrcVT)));
15154     }
15155     if (SrcVT.getVectorElementType() == MVT::i1) {
15156       if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15157         return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15158                            DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15159       MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15160       return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15161                          DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
15162     }
15163     return SDValue();
15164   }
15165
15166   assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15167          "Unknown SINT_TO_FP to lower!");
15168
15169   // These are really Legal; return the operand so the caller accepts it as
15170   // Legal.
15171   if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15172     return Op;
15173   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15174       Subtarget.is64Bit()) {
15175     return Op;
15176   }
15177
15178   SDValue ValueToStore = Op.getOperand(0);
15179   if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15180       !Subtarget.is64Bit())
15181     // Bitcasting to f64 here allows us to do a single 64-bit store from
15182     // an SSE register, avoiding the store forwarding penalty that would come
15183     // with two 32-bit stores.
15184     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15185
15186   unsigned Size = SrcVT.getSizeInBits()/8;
15187   MachineFunction &MF = DAG.getMachineFunction();
15188   auto PtrVT = getPointerTy(MF.getDataLayout());
15189   int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15190   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15191   SDValue Chain = DAG.getStore(
15192       DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15193       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15194   return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15195 }
15196
15197 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15198                                      SDValue StackSlot,
15199                                      SelectionDAG &DAG) const {
15200   // Build the FILD
15201   SDLoc DL(Op);
15202   SDVTList Tys;
15203   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15204   if (useSSE)
15205     Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15206   else
15207     Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15208
15209   unsigned ByteSize = SrcVT.getSizeInBits()/8;
15210
15211   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15212   MachineMemOperand *MMO;
15213   if (FI) {
15214     int SSFI = FI->getIndex();
15215     MMO = DAG.getMachineFunction().getMachineMemOperand(
15216         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15217         MachineMemOperand::MOLoad, ByteSize, ByteSize);
15218   } else {
15219     MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15220     StackSlot = StackSlot.getOperand(1);
15221   }
15222   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15223   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15224                                            X86ISD::FILD, DL,
15225                                            Tys, Ops, SrcVT, MMO);
15226
15227   if (useSSE) {
15228     Chain = Result.getValue(1);
15229     SDValue InFlag = Result.getValue(2);
15230
15231     // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15232     // shouldn't be necessary except that RFP cannot be live across
15233     // multiple blocks. When stackifier is fixed, they can be uncoupled.
15234     MachineFunction &MF = DAG.getMachineFunction();
15235     unsigned SSFISize = Op.getValueSizeInBits()/8;
15236     int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15237     auto PtrVT = getPointerTy(MF.getDataLayout());
15238     SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15239     Tys = DAG.getVTList(MVT::Other);
15240     SDValue Ops[] = {
15241       Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15242     };
15243     MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15244         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15245         MachineMemOperand::MOStore, SSFISize, SSFISize);
15246
15247     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15248                                     Ops, Op.getValueType(), MMO);
15249     Result = DAG.getLoad(
15250         Op.getValueType(), DL, Chain, StackSlot,
15251         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15252   }
15253
15254   return Result;
15255 }
15256
15257 /// 64-bit unsigned integer to double expansion.
15258 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15259                                                SelectionDAG &DAG) const {
15260   // This algorithm is not obvious. Here it is what we're trying to output:
15261   /*
15262      movq       %rax,  %xmm0
15263      punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15264      subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15265      #ifdef __SSE3__
15266        haddpd   %xmm0, %xmm0
15267      #else
15268        pshufd   $0x4e, %xmm0, %xmm1
15269        addpd    %xmm1, %xmm0
15270      #endif
15271   */
15272
15273   SDLoc dl(Op);
15274   LLVMContext *Context = DAG.getContext();
15275
15276   // Build some magic constants.
15277   static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15278   Constant *C0 = ConstantDataVector::get(*Context, CV0);
15279   auto PtrVT = getPointerTy(DAG.getDataLayout());
15280   SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15281
15282   SmallVector<Constant*,2> CV1;
15283   CV1.push_back(
15284     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15285                                       APInt(64, 0x4330000000000000ULL))));
15286   CV1.push_back(
15287     ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15288                                       APInt(64, 0x4530000000000000ULL))));
15289   Constant *C1 = ConstantVector::get(CV1);
15290   SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15291
15292   // Load the 64-bit value into an XMM register.
15293   SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15294                             Op.getOperand(0));
15295   SDValue CLod0 =
15296       DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15297                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15298                   /* Alignment = */ 16);
15299   SDValue Unpck1 =
15300       getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15301
15302   SDValue CLod1 =
15303       DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15304                   MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15305                   /* Alignment = */ 16);
15306   SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15307   // TODO: Are there any fast-math-flags to propagate here?
15308   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15309   SDValue Result;
15310
15311   if (Subtarget.hasSSE3()) {
15312     // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15313     Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15314   } else {
15315     SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15316     SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15317     Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15318                          DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15319   }
15320
15321   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15322                      DAG.getIntPtrConstant(0, dl));
15323 }
15324
15325 /// 32-bit unsigned integer to float expansion.
15326 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15327                                                SelectionDAG &DAG) const {
15328   SDLoc dl(Op);
15329   // FP constant to bias correct the final result.
15330   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15331                                    MVT::f64);
15332
15333   // Load the 32-bit value into an XMM register.
15334   SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15335                              Op.getOperand(0));
15336
15337   // Zero out the upper parts of the register.
15338   Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15339
15340   Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15341                      DAG.getBitcast(MVT::v2f64, Load),
15342                      DAG.getIntPtrConstant(0, dl));
15343
15344   // Or the load with the bias.
15345   SDValue Or = DAG.getNode(
15346       ISD::OR, dl, MVT::v2i64,
15347       DAG.getBitcast(MVT::v2i64,
15348                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15349       DAG.getBitcast(MVT::v2i64,
15350                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15351   Or =
15352       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15353                   DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15354
15355   // Subtract the bias.
15356   // TODO: Are there any fast-math-flags to propagate here?
15357   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15358
15359   // Handle final rounding.
15360   MVT DestVT = Op.getSimpleValueType();
15361
15362   if (DestVT.bitsLT(MVT::f64))
15363     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15364                        DAG.getIntPtrConstant(0, dl));
15365   if (DestVT.bitsGT(MVT::f64))
15366     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15367
15368   // Handle final rounding.
15369   return Sub;
15370 }
15371
15372 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15373                                      const X86Subtarget &Subtarget, SDLoc &DL) {
15374   if (Op.getSimpleValueType() != MVT::v2f64)
15375     return SDValue();
15376
15377   SDValue N0 = Op.getOperand(0);
15378   assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15379
15380   // Legalize to v4i32 type.
15381   N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15382                    DAG.getUNDEF(MVT::v2i32));
15383
15384   if (Subtarget.hasAVX512())
15385     return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15386
15387   // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15388   // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15389   SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15390   SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15391
15392   // Two to the power of half-word-size.
15393   SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15394
15395   // Clear upper part of LO, lower HI.
15396   SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15397   SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15398
15399   SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15400           fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15401   SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15402
15403   // Add the two halves.
15404   return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15405 }
15406
15407 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15408                                      const X86Subtarget &Subtarget) {
15409   // The algorithm is the following:
15410   // #ifdef __SSE4_1__
15411   //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15412   //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15413   //                                 (uint4) 0x53000000, 0xaa);
15414   // #else
15415   //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15416   //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
15417   // #endif
15418   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15419   //     return (float4) lo + fhi;
15420
15421   // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15422   // reassociate the two FADDs, and if we do that, the algorithm fails
15423   // spectacularly (PR24512).
15424   // FIXME: If we ever have some kind of Machine FMF, this should be marked
15425   // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15426   // there's also the MachineCombiner reassociations happening on Machine IR.
15427   if (DAG.getTarget().Options.UnsafeFPMath)
15428     return SDValue();
15429
15430   SDLoc DL(Op);
15431   SDValue V = Op->getOperand(0);
15432   MVT VecIntVT = V.getSimpleValueType();
15433   bool Is128 = VecIntVT == MVT::v4i32;
15434   MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15435   // If we convert to something else than the supported type, e.g., to v4f64,
15436   // abort early.
15437   if (VecFloatVT != Op->getSimpleValueType(0))
15438     return SDValue();
15439
15440   assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15441          "Unsupported custom type");
15442
15443   // In the #idef/#else code, we have in common:
15444   // - The vector of constants:
15445   // -- 0x4b000000
15446   // -- 0x53000000
15447   // - A shift:
15448   // -- v >> 16
15449
15450   // Create the splat vector for 0x4b000000.
15451   SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15452   // Create the splat vector for 0x53000000.
15453   SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15454
15455   // Create the right shift.
15456   SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15457   SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15458
15459   SDValue Low, High;
15460   if (Subtarget.hasSSE41()) {
15461     MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15462     //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15463     SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15464     SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15465     // Low will be bitcasted right away, so do not bother bitcasting back to its
15466     // original type.
15467     Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15468                       VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15469     //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15470     //                                 (uint4) 0x53000000, 0xaa);
15471     SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15472     SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15473     // High will be bitcasted right away, so do not bother bitcasting back to
15474     // its original type.
15475     High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15476                        VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15477   } else {
15478     SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15479     //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15480     SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15481     Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15482
15483     //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
15484     High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15485   }
15486
15487   // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15488   SDValue VecCstFAdd = DAG.getConstantFP(
15489       APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15490
15491   //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15492   SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15493   // TODO: Are there any fast-math-flags to propagate here?
15494   SDValue FHigh =
15495       DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15496   //     return (float4) lo + fhi;
15497   SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15498   return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
15499 }
15500
15501 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15502                                                SelectionDAG &DAG) const {
15503   SDValue N0 = Op.getOperand(0);
15504   MVT SrcVT = N0.getSimpleValueType();
15505   SDLoc dl(Op);
15506
15507   if (SrcVT.getVectorElementType() == MVT::i1) {
15508     if (SrcVT == MVT::v2i1)
15509       return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15510                          DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15511     MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15512     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15513                        DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15514   }
15515
15516   switch (SrcVT.SimpleTy) {
15517   default:
15518     llvm_unreachable("Custom UINT_TO_FP is not supported!");
15519   case MVT::v4i8:
15520   case MVT::v4i16:
15521   case MVT::v8i8:
15522   case MVT::v8i16: {
15523     MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15524     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15525                        DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
15526   }
15527   case MVT::v2i32:
15528     return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15529   case MVT::v4i32:
15530   case MVT::v8i32:
15531     return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15532   case MVT::v16i8:
15533   case MVT::v16i16:
15534     assert(Subtarget.hasAVX512());
15535     return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15536                        DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
15537   }
15538 }
15539
15540 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15541                                            SelectionDAG &DAG) const {
15542   SDValue N0 = Op.getOperand(0);
15543   SDLoc dl(Op);
15544   auto PtrVT = getPointerTy(DAG.getDataLayout());
15545
15546   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15547   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15548   // the optimization here.
15549   if (DAG.SignBitIsZero(N0))
15550     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15551
15552   if (Op.getSimpleValueType().isVector())
15553     return lowerUINT_TO_FP_vec(Op, DAG);
15554
15555   MVT SrcVT = N0.getSimpleValueType();
15556   MVT DstVT = Op.getSimpleValueType();
15557
15558   if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15559       (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15560     // Conversions from unsigned i32 to f32/f64 are legal,
15561     // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
15562     return Op;
15563   }
15564
15565   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15566     return LowerUINT_TO_FP_i64(Op, DAG);
15567   if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15568     return LowerUINT_TO_FP_i32(Op, DAG);
15569   if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15570     return SDValue();
15571
15572   // Make a 64-bit buffer, and use it to build an FILD.
15573   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15574   if (SrcVT == MVT::i32) {
15575     SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15576     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15577                                   StackSlot, MachinePointerInfo());
15578     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15579                                   OffsetSlot, MachinePointerInfo());
15580     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15581     return Fild;
15582   }
15583
15584   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15585   SDValue ValueToStore = Op.getOperand(0);
15586   if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15587     // Bitcasting to f64 here allows us to do a single 64-bit store from
15588     // an SSE register, avoiding the store forwarding penalty that would come
15589     // with two 32-bit stores.
15590     ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15591   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15592                                MachinePointerInfo());
15593   // For i64 source, we need to add the appropriate power of 2 if the input
15594   // was negative.  This is the same as the optimization in
15595   // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
15596   // we must be careful to do the computation in x87 extended precision, not
15597   // in SSE. (The generic code can't know it's OK to do this, or how to.)
15598   int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15599   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15600       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15601       MachineMemOperand::MOLoad, 8, 8);
15602
15603   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15604   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15605   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15606                                          MVT::i64, MMO);
15607
15608   APInt FF(32, 0x5F800000ULL);
15609
15610   // Check whether the sign bit is set.
15611   SDValue SignSet = DAG.getSetCC(
15612       dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15613       Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15614
15615   // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15616   SDValue FudgePtr = DAG.getConstantPool(
15617       ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15618
15619   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15620   SDValue Zero = DAG.getIntPtrConstant(0, dl);
15621   SDValue Four = DAG.getIntPtrConstant(4, dl);
15622   SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15623   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15624
15625   // Load the value out, extending it from f32 to f80.
15626   // FIXME: Avoid the extend by constructing the right constant pool?
15627   SDValue Fudge = DAG.getExtLoad(
15628       ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15629       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15630       /* Alignment = */ 4);
15631   // Extend everything to 80 bits to force it to be done on x87.
15632   // TODO: Are there any fast-math-flags to propagate here?
15633   SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15634   return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15635                      DAG.getIntPtrConstant(0, dl));
15636 }
15637
15638 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15639 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15640 // just return an <SDValue(), SDValue()> pair.
15641 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15642 // to i16, i32 or i64, and we lower it to a legal sequence.
15643 // If lowered to the final integer result we return a <result, SDValue()> pair.
15644 // Otherwise we lower it to a sequence ending with a FIST, return a
15645 // <FIST, StackSlot> pair, and the caller is responsible for loading
15646 // the final integer result from StackSlot.
15647 std::pair<SDValue,SDValue>
15648 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15649                                    bool IsSigned, bool IsReplace) const {
15650   SDLoc DL(Op);
15651
15652   EVT DstTy = Op.getValueType();
15653   EVT TheVT = Op.getOperand(0).getValueType();
15654   auto PtrVT = getPointerTy(DAG.getDataLayout());
15655
15656   if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15657     // f16 must be promoted before using the lowering in this routine.
15658     // fp128 does not use this lowering.
15659     return std::make_pair(SDValue(), SDValue());
15660   }
15661
15662   // If using FIST to compute an unsigned i64, we'll need some fixup
15663   // to handle values above the maximum signed i64.  A FIST is always
15664   // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15665   bool UnsignedFixup = !IsSigned &&
15666                        DstTy == MVT::i64 &&
15667                        (!Subtarget.is64Bit() ||
15668                         !isScalarFPTypeInSSEReg(TheVT));
15669
15670   if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15671     // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15672     // The low 32 bits of the fist result will have the correct uint32 result.
15673     assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15674     DstTy = MVT::i64;
15675   }
15676
15677   assert(DstTy.getSimpleVT() <= MVT::i64 &&
15678          DstTy.getSimpleVT() >= MVT::i16 &&
15679          "Unknown FP_TO_INT to lower!");
15680
15681   // These are really Legal.
15682   if (DstTy == MVT::i32 &&
15683       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15684     return std::make_pair(SDValue(), SDValue());
15685   if (Subtarget.is64Bit() &&
15686       DstTy == MVT::i64 &&
15687       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15688     return std::make_pair(SDValue(), SDValue());
15689
15690   // We lower FP->int64 into FISTP64 followed by a load from a temporary
15691   // stack slot.
15692   MachineFunction &MF = DAG.getMachineFunction();
15693   unsigned MemSize = DstTy.getSizeInBits()/8;
15694   int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15695   SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15696
15697   unsigned Opc;
15698   switch (DstTy.getSimpleVT().SimpleTy) {
15699   default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15700   case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15701   case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15702   case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15703   }
15704
15705   SDValue Chain = DAG.getEntryNode();
15706   SDValue Value = Op.getOperand(0);
15707   SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15708
15709   if (UnsignedFixup) {
15710     //
15711     // Conversion to unsigned i64 is implemented with a select,
15712     // depending on whether the source value fits in the range
15713     // of a signed i64.  Let Thresh be the FP equivalent of
15714     // 0x8000000000000000ULL.
15715     //
15716     //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15717     //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
15718     //  Fist-to-mem64 FistSrc
15719     //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15720     //  to XOR'ing the high 32 bits with Adjust.
15721     //
15722     // Being a power of 2, Thresh is exactly representable in all FP formats.
15723     // For X87 we'd like to use the smallest FP type for this constant, but
15724     // for DAG type consistency we have to match the FP operand type.
15725
15726     APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15727     LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15728     bool LosesInfo = false;
15729     if (TheVT == MVT::f64)
15730       // The rounding mode is irrelevant as the conversion should be exact.
15731       Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15732                               &LosesInfo);
15733     else if (TheVT == MVT::f80)
15734       Status = Thresh.convert(APFloat::x87DoubleExtended(),
15735                               APFloat::rmNearestTiesToEven, &LosesInfo);
15736
15737     assert(Status == APFloat::opOK && !LosesInfo &&
15738            "FP conversion should have been exact");
15739
15740     SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15741
15742     SDValue Cmp = DAG.getSetCC(DL,
15743                                getSetCCResultType(DAG.getDataLayout(),
15744                                                   *DAG.getContext(), TheVT),
15745                                Value, ThreshVal, ISD::SETLT);
15746     Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15747                            DAG.getConstant(0, DL, MVT::i32),
15748                            DAG.getConstant(0x80000000, DL, MVT::i32));
15749     SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15750     Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15751                                               *DAG.getContext(), TheVT),
15752                        Value, ThreshVal, ISD::SETLT);
15753     Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15754   }
15755
15756   // FIXME This causes a redundant load/store if the SSE-class value is already
15757   // in memory, such as if it is on the callstack.
15758   if (isScalarFPTypeInSSEReg(TheVT)) {
15759     assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15760     Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15761                          MachinePointerInfo::getFixedStack(MF, SSFI));
15762     SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15763     SDValue Ops[] = {
15764       Chain, StackSlot, DAG.getValueType(TheVT)
15765     };
15766
15767     MachineMemOperand *MMO =
15768         MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15769                                 MachineMemOperand::MOLoad, MemSize, MemSize);
15770     Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15771     Chain = Value.getValue(1);
15772     SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15773     StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15774   }
15775
15776   MachineMemOperand *MMO =
15777       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15778                               MachineMemOperand::MOStore, MemSize, MemSize);
15779
15780   if (UnsignedFixup) {
15781
15782     // Insert the FIST, load its result as two i32's,
15783     // and XOR the high i32 with Adjust.
15784
15785     SDValue FistOps[] = { Chain, Value, StackSlot };
15786     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15787                                            FistOps, DstTy, MMO);
15788
15789     SDValue Low32 =
15790         DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15791     SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15792
15793     SDValue High32 =
15794         DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15795     High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15796
15797     if (Subtarget.is64Bit()) {
15798       // Join High32 and Low32 into a 64-bit result.
15799       // (High32 << 32) | Low32
15800       Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15801       High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15802       High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15803                            DAG.getConstant(32, DL, MVT::i8));
15804       SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15805       return std::make_pair(Result, SDValue());
15806     }
15807
15808     SDValue ResultOps[] = { Low32, High32 };
15809
15810     SDValue pair = IsReplace
15811       ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15812       : DAG.getMergeValues(ResultOps, DL);
15813     return std::make_pair(pair, SDValue());
15814   } else {
15815     // Build the FP_TO_INT*_IN_MEM
15816     SDValue Ops[] = { Chain, Value, StackSlot };
15817     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15818                                            Ops, DstTy, MMO);
15819     return std::make_pair(FIST, StackSlot);
15820   }
15821 }
15822
15823 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15824                               const X86Subtarget &Subtarget) {
15825   MVT VT = Op->getSimpleValueType(0);
15826   SDValue In = Op->getOperand(0);
15827   MVT InVT = In.getSimpleValueType();
15828   SDLoc dl(Op);
15829
15830   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15831     return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15832
15833   // Optimize vectors in AVX mode:
15834   //
15835   //   v8i16 -> v8i32
15836   //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
15837   //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
15838   //   Concat upper and lower parts.
15839   //
15840   //   v4i32 -> v4i64
15841   //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
15842   //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
15843   //   Concat upper and lower parts.
15844   //
15845
15846   if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15847       ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15848       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15849     return SDValue();
15850
15851   if (Subtarget.hasInt256())
15852     return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15853
15854   SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15855   SDValue Undef = DAG.getUNDEF(InVT);
15856   bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15857   SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15858   SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15859
15860   MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15861                              VT.getVectorNumElements()/2);
15862
15863   OpLo = DAG.getBitcast(HVT, OpLo);
15864   OpHi = DAG.getBitcast(HVT, OpHi);
15865
15866   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15867 }
15868
15869 static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15870                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15871   MVT VT = Op->getSimpleValueType(0);
15872   SDValue In = Op->getOperand(0);
15873   MVT InVT = In.getSimpleValueType();
15874   SDLoc DL(Op);
15875   unsigned NumElts = VT.getVectorNumElements();
15876
15877   if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15878       (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15879     return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15880
15881   if (InVT.getVectorElementType() != MVT::i1)
15882     return SDValue();
15883
15884   // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
15885   MVT ExtVT = VT;
15886   if (!VT.is512BitVector() && !Subtarget.hasVLX())
15887     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15888
15889   SDValue One =
15890    DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15891   SDValue Zero =
15892    DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15893
15894   SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
15895   if (VT == ExtVT)
15896     return SelectedVal;
15897   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15898 }
15899
15900 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15901                                SelectionDAG &DAG) {
15902   if (Subtarget.hasFp256())
15903     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15904       return Res;
15905
15906   return SDValue();
15907 }
15908
15909 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15910                                 SelectionDAG &DAG) {
15911   SDLoc DL(Op);
15912   MVT VT = Op.getSimpleValueType();
15913   SDValue In = Op.getOperand(0);
15914   MVT SVT = In.getSimpleValueType();
15915
15916   if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15917     return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15918
15919   if (Subtarget.hasFp256())
15920     if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15921       return Res;
15922
15923   assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15924          VT.getVectorNumElements() != SVT.getVectorNumElements());
15925   return SDValue();
15926 }
15927
15928 /// Helper to recursively truncate vector elements in half with PACKSS.
15929 /// It makes use of the fact that vector comparison results will be all-zeros
15930 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15931 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15932 /// within each 128-bit lane.
      ///
      /// \returns the truncated value, \p In itself when it already has type
      /// \p DstVT, or an empty SDValue when the subtarget lacks SSE2 / has AVX512
      /// or when the source/destination sizes are not supported.
15933 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15934                                                const SDLoc &DL,
15935                                                SelectionDAG &DAG,
15936                                                const X86Subtarget &Subtarget) {
15937   // Requires SSE2 but AVX512 has fast truncate.
15938   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15939     return SDValue();
15940
15941   EVT SrcVT = In.getValueType();
15942
15943   // No truncation required, we might get here due to recursive calls.
15944   if (SrcVT == DstVT)
15945     return In;
15946
15947   // We only support vector truncation to 128bits or greater from a
15948   // 256bits or greater source.
15949   if ((DstVT.getSizeInBits() % 128) != 0)
15950     return SDValue();
15951   if ((SrcVT.getSizeInBits() % 256) != 0)
15952     return SDValue();
15953
15954   unsigned NumElems = SrcVT.getVectorNumElements();
15955   assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15956   assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15957
        // Integer scalar type with half the bit width of the source elements; it
        // is the element type produced by one packing stage.
15958   EVT PackedSVT =
15959       EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15960
15961   // Extract lower/upper subvectors.
15962   unsigned NumSubElts = NumElems / 2;
15963   unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15964   SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15965   SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15966
15967   // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
        // The pack is always emitted as v8i16 x 2 -> v16i8, whatever the real
        // element type is, and the v16i8 result is then reinterpreted as DstVT.
15968   if (SrcVT.is256BitVector()) {
15969     Lo = DAG.getBitcast(MVT::v8i16, Lo);
15970     Hi = DAG.getBitcast(MVT::v8i16, Hi);
15971     SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15972     return DAG.getBitcast(DstVT, Res);
15973   }
15974
15975   // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15976   // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15977   if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15978     Lo = DAG.getBitcast(MVT::v16i16, Lo);
15979     Hi = DAG.getBitcast(MVT::v16i16, Hi);
15980     SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15981
15982     // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15983     // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15984     Res = DAG.getBitcast(MVT::v4i64, Res);
15985     Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15986
15987     if (DstVT.is256BitVector())
15988       return DAG.getBitcast(DstVT, Res);
15989
15990     // If 512bit -> 128bit truncate another stage.
          // Reinterpret the 256-bit intermediate as NumElems half-width elements
          // and recurse to reach DstVT.
15991     EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15992     Res = DAG.getBitcast(PackedVT, Res);
15993     return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15994   }
15995
15996   // Recursively pack lower/upper subvectors, concat result and pack again.
15997   assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
        // Each half is first truncated to NumElems / 2 half-width elements.
15998   EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15999   Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
16000   Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
16001
        // Rejoin the halves and recurse; the recursion returns the concatenation
        // unchanged if it already has type DstVT.
16002   PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
16003   SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
16004   return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
16005 }
16006
16007 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
16008                                   const X86Subtarget &Subtarget) {
16009
16010   SDLoc DL(Op);
16011   MVT VT = Op.getSimpleValueType();
16012   SDValue In = Op.getOperand(0);
16013   MVT InVT = In.getSimpleValueType();
16014
16015   assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16016
16017   // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16018   unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16019   if (InVT.getScalarSizeInBits() <= 16) {
16020     if (Subtarget.hasBWI()) {
16021       // legal, will go to VPMOVB2M, VPMOVW2M
16022       // Shift packed bytes not supported natively, bitcast to word
16023       MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16024       SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
16025                                        DAG.getBitcast(ExtVT, In),
16026                                        DAG.getConstant(ShiftInx, DL, ExtVT));
16027       ShiftNode = DAG.getBitcast(InVT, ShiftNode);
16028       return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
16029     }
16030     // Use TESTD/Q, extended vector to packed dword/qword.
16031     assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16032            "Unexpected vector type.");
16033     unsigned NumElts = InVT.getVectorNumElements();
16034     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
16035     In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
16036     InVT = ExtVT;
16037     ShiftInx = InVT.getScalarSizeInBits() - 1;
16038   }
16039
16040   SDValue  ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
16041                                    DAG.getConstant(ShiftInx, DL, InVT));
16042   return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
16043 }
16044
16045 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
        // Custom lowering of ISD::TRUNCATE. The cases are tried in this order:
        //  - scalar truncation to i1 (sources narrower than 32 bits only),
        //  - vector truncation to i1 elements (delegated to LowerTruncateVecI1),
        //  - AVX-512 subtargets: X86ISD::VTRUNC,
        //  - inputs whose elements consist only of sign bits: PACKSS,
        //  - 256-bit -> 128-bit truncation built from shuffles.
        // An empty SDValue is returned for anything else.
16046   SDLoc DL(Op);
16047   MVT VT = Op.getSimpleValueType();
16048   SDValue In = Op.getOperand(0);
16049   MVT InVT = In.getSimpleValueType();
16050
16051   if (VT == MVT::i1) {
16052     assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
16053            "Invalid scalar TRUNCATE operation");
          // Sources of 32 bits or more are not custom lowered here.
16054     if (InVT.getSizeInBits() >= 32)
16055       return SDValue();
          // Narrower sources are first any-extended to i32 and truncated from
          // there.
16056     In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
16057     return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
16058   }
16059   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16060          "Invalid TRUNCATE operation");
16061
16062   if (VT.getVectorElementType() == MVT::i1)
16063     return LowerTruncateVecI1(Op, DAG, Subtarget);
16064
16065   // vpmovqb/w/d, vpmovdb/w, vpmovwb
16066   if (Subtarget.hasAVX512()) {
16067     // word to byte only under BWI
          // Without BWI the v16i16 input is sign-extended to v16i32 first and the
          // truncation is done from there.
16068     if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
16069       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
16070                          getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
16071     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
16072   }
16073
16074   // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
        // "Zero/all-bits" is detected as: every bit of each element is a sign bit.
16075   if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
16076     if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
16077       return V;
16078
16079   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16080     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16081     if (Subtarget.hasInt256()) {
            // Gather the even-indexed dwords (0, 2, 4, 6) into the low four lanes
            // and keep only the low 128 bits.
16082       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16083       In = DAG.getBitcast(MVT::v8i32, In);
16084       In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16085       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16086                          DAG.getIntPtrConstant(0, DL));
16087     }
16088
          // Without 256-bit integer support: split into two v2i64 halves, view
          // each as v4i32 and pick the even-indexed dwords of both with a single
          // two-input shuffle.
16089     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16090                                DAG.getIntPtrConstant(0, DL));
16091     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16092                                DAG.getIntPtrConstant(2, DL));
16093     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16094     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16095     static const int ShufMask[] = {0, 2, 4, 6};
16096     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16097   }
16098
16099   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16100     // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16101     if (Subtarget.hasInt256()) {
16102       In = DAG.getBitcast(MVT::v32i8, In);
16103
16104       // The PSHUFB mask:
            // Within each 16-byte half, move bytes 0-1 of every dword into the low
            // eight bytes; the remaining bytes are undefined.
16105       static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
16106                                       -1, -1, -1, -1, -1, -1, -1, -1,
16107                                       16, 17, 20, 21, 24, 25, 28, 29,
16108                                       -1, -1, -1, -1, -1, -1, -1, -1 };
16109       In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16110       In = DAG.getBitcast(MVT::v4i64, In);
16111
            // Bring qwords 0 and 2 (the two populated ones) next to each other and
            // take the low 128 bits.
16112       static const int ShufMask2[] = {0,  2,  -1,  -1};
16113       In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, In, ShufMask2);
16114       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16115                        DAG.getIntPtrConstant(0, DL));
16116       return DAG.getBitcast(VT, In);
16117     }
16118
16119     SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16120                                DAG.getIntPtrConstant(0, DL));
16121
16122     SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16123                                DAG.getIntPtrConstant(4, DL));
16124
16125     OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16126     OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16127
16128     // The PSHUFB mask:
16129     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
16130                                    -1, -1, -1, -1, -1, -1, -1, -1};
16131
16132     OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16133     OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16134
16135     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16136     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16137
16138     // The MOVLHPS Mask:
          // Low two dwords of OpLo followed by low two dwords of OpHi.
16139     static const int ShufMask2[] = {0, 1, 4, 5};
16140     SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16141     return DAG.getBitcast(MVT::v8i16, res);
16142   }
16143
16144   // Handle truncation of V256 to V128 using shuffles.
16145   if (!VT.is128BitVector() || !InVT.is256BitVector())
16146     return SDValue();
16147
16148   assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16149
        // View the 256-bit input as 2 * NumElems elements of the result element
        // type, shuffle the even-indexed ones into the low half (the rest of the
        // mask stays undefined) and extract the low 128 bits.
16150   unsigned NumElems = VT.getVectorNumElements();
16151   MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16152
16153   SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16154   // Prepare truncation shuffle mask
16155   for (unsigned i = 0; i != NumElems; ++i)
16156     MaskVec[i] = i * 2;
16157   In = DAG.getBitcast(NVT, In);
16158   SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16159   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16160                      DAG.getIntPtrConstant(0, DL));
16161 }
16162
16163 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16164   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16165   MVT VT = Op.getSimpleValueType();
16166
16167   if (VT.isVector()) {
16168     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16169     SDValue Src = Op.getOperand(0);
16170     SDLoc dl(Op);
16171     if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16172       return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16173                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16174                                      DAG.getUNDEF(MVT::v2f32)));
16175     }
16176
16177     return SDValue();
16178   }
16179
16180   assert(!VT.isVector());
16181
16182   std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16183     IsSigned, /*IsReplace=*/ false);
16184   SDValue FIST = Vals.first, StackSlot = Vals.second;
16185   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16186   if (!FIST.getNode())
16187     return Op;
16188
16189   if (StackSlot.getNode())
16190     // Load the result.
16191     return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16192
16193   // The node is the result.
16194   return FIST;
16195 }
16196
16197 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16198   SDLoc DL(Op);
16199   MVT VT = Op.getSimpleValueType();
16200   SDValue In = Op.getOperand(0);
16201   MVT SVT = In.getSimpleValueType();
16202
16203   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16204
16205   return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16206                      DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16207                                  In, DAG.getUNDEF(SVT)));
16208 }
16209
16210 /// The only differences between FABS and FNEG are the mask and the logic op.
16211 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16212 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16213   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16214          "Wrong opcode for lowering FABS or FNEG.");
16215
16216   bool IsFABS = (Op.getOpcode() == ISD::FABS);
16217
16218   // If this is a FABS and it has an FNEG user, bail out to fold the combination
16219   // into an FNABS. We'll lower the FABS after that if it is still in use.
16220   if (IsFABS)
16221     for (SDNode *User : Op->uses())
16222       if (User->getOpcode() == ISD::FNEG)
16223         return Op;
16224
16225   SDLoc dl(Op);
16226   MVT VT = Op.getSimpleValueType();
16227
16228   bool IsF128 = (VT == MVT::f128);
16229
16230   // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16231   // decide if we should generate a 16-byte constant mask when we only need 4 or
16232   // 8 bytes for the scalar case.
16233
16234   MVT LogicVT;
16235   MVT EltVT;
16236
16237   if (VT.isVector()) {
16238     LogicVT = VT;
16239     EltVT = VT.getVectorElementType();
16240   } else if (IsF128) {
16241     // SSE instructions are used for optimized f128 logical operations.
16242     LogicVT = MVT::f128;
16243     EltVT = VT;
16244   } else {
16245     // There are no scalar bitwise logical SSE/AVX instructions, so we
16246     // generate a 16-byte vector constant and logic op even for the scalar case.
16247     // Using a 16-byte mask allows folding the load of the mask with
16248     // the logic op, so it can save (~4 bytes) on code size.
16249     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16250     EltVT = VT;
16251   }
16252
16253   unsigned EltBits = EltVT.getSizeInBits();
16254   // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16255   APInt MaskElt =
16256     IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16257   const fltSemantics &Sem =
16258       EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16259           (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16260   SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16261
16262   SDValue Op0 = Op.getOperand(0);
16263   bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16264   unsigned LogicOp =
16265     IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16266   SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16267
16268   if (VT.isVector() || IsF128)
16269     return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16270
16271   // For the scalar case extend to a 128-bit vector, perform the logic op,
16272   // and extract the scalar result back out.
16273   Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16274   SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16275   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16276                      DAG.getIntPtrConstant(0, dl));
16277 }
16278
16279 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16280   SDValue Mag = Op.getOperand(0);
16281   SDValue Sign = Op.getOperand(1);
16282   SDLoc dl(Op);
16283
16284   // If the sign operand is smaller, extend it first.
16285   MVT VT = Op.getSimpleValueType();
16286   if (Sign.getSimpleValueType().bitsLT(VT))
16287     Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16288
16289   // And if it is bigger, shrink it first.
16290   if (Sign.getSimpleValueType().bitsGT(VT))
16291     Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16292
16293   // At this point the operands and the result should have the same
16294   // type, and that won't be f80 since that is not custom lowered.
16295   bool IsF128 = (VT == MVT::f128);
16296   assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16297           VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16298           VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16299          "Unexpected type in LowerFCOPYSIGN");
16300
16301   MVT EltVT = VT.getScalarType();
16302   const fltSemantics &Sem =
16303       EltVT == MVT::f64 ? APFloat::IEEEdouble()
16304                         : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16305
16306   // Perform all scalar logic operations as 16-byte vectors because there are no
16307   // scalar FP logic instructions in SSE.
16308   // TODO: This isn't necessary. If we used scalar types, we might avoid some
16309   // unnecessary splats, but we might miss load folding opportunities. Should
16310   // this decision be based on OptimizeForSize?
16311   bool IsFakeVector = !VT.isVector() && !IsF128;
16312   MVT LogicVT = VT;
16313   if (IsFakeVector)
16314     LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16315
16316   // The mask constants are automatically splatted for vector types.
16317   unsigned EltSizeInBits = VT.getScalarSizeInBits();
16318   SDValue SignMask = DAG.getConstantFP(
16319       APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16320   SDValue MagMask = DAG.getConstantFP(
16321       APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16322
16323   // First, clear all bits but the sign bit from the second operand (sign).
16324   if (IsFakeVector)
16325     Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16326   SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16327
16328   // Next, clear the sign bit from the first operand (magnitude).
16329   // TODO: If we had general constant folding for FP logic ops, this check
16330   // wouldn't be necessary.
16331   SDValue MagBits;
16332   if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16333     APFloat APF = Op0CN->getValueAPF();
16334     APF.clearSign();
16335     MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16336   } else {
16337     // If the magnitude operand wasn't a constant, we need to AND out the sign.
16338     if (IsFakeVector)
16339       Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16340     MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16341   }
16342
16343   // OR the magnitude value with the sign bit.
16344   SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16345   return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16346                                           DAG.getIntPtrConstant(0, dl));
16347 }
16348
16349 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16350   SDValue N0 = Op.getOperand(0);
16351   SDLoc dl(Op);
16352   MVT VT = Op.getSimpleValueType();
16353
16354   MVT OpVT = N0.getSimpleValueType();
16355   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16356          "Unexpected type for FGETSIGN");
16357
16358   // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16359   MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16360   SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16361   Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16362   Res = DAG.getZExtOrTrunc(Res, dl, VT);
16363   Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16364   return Res;
16365 }
16366
16367 // Check whether an OR'd tree is PTEST-able.
16368 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16369                                       SelectionDAG &DAG) {
16370   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16371
16372   if (!Subtarget.hasSSE41())
16373     return SDValue();
16374
16375   if (!Op->hasOneUse())
16376     return SDValue();
16377
16378   SDNode *N = Op.getNode();
16379   SDLoc DL(N);
16380
16381   SmallVector<SDValue, 8> Opnds;
16382   DenseMap<SDValue, unsigned> VecInMap;
16383   SmallVector<SDValue, 8> VecIns;
16384   EVT VT = MVT::Other;
16385
16386   // Recognize a special case where a vector is casted into wide integer to
16387   // test all 0s.
16388   Opnds.push_back(N->getOperand(0));
16389   Opnds.push_back(N->getOperand(1));
16390
16391   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16392     SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16393     // BFS traverse all OR'd operands.
16394     if (I->getOpcode() == ISD::OR) {
16395       Opnds.push_back(I->getOperand(0));
16396       Opnds.push_back(I->getOperand(1));
16397       // Re-evaluate the number of nodes to be traversed.
16398       e += 2; // 2 more nodes (LHS and RHS) are pushed.
16399       continue;
16400     }
16401
16402     // Quit if a non-EXTRACT_VECTOR_ELT
16403     if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16404       return SDValue();
16405
16406     // Quit if without a constant index.
16407     SDValue Idx = I->getOperand(1);
16408     if (!isa<ConstantSDNode>(Idx))
16409       return SDValue();
16410
16411     SDValue ExtractedFromVec = I->getOperand(0);
16412     DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16413     if (M == VecInMap.end()) {
16414       VT = ExtractedFromVec.getValueType();
16415       // Quit if not 128/256-bit vector.
16416       if (!VT.is128BitVector() && !VT.is256BitVector())
16417         return SDValue();
16418       // Quit if not the same type.
16419       if (VecInMap.begin() != VecInMap.end() &&
16420           VT != VecInMap.begin()->first.getValueType())
16421         return SDValue();
16422       M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16423       VecIns.push_back(ExtractedFromVec);
16424     }
16425     M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16426   }
16427
16428   assert((VT.is128BitVector() || VT.is256BitVector()) &&
16429          "Not extracted from 128-/256-bit vector.");
16430
16431   unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16432
16433   for (DenseMap<SDValue, unsigned>::const_iterator
16434         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16435     // Quit if not all elements are used.
16436     if (I->second != FullMask)
16437       return SDValue();
16438   }
16439
16440   MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16441
16442   // Cast all vectors into TestVT for PTEST.
16443   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16444     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16445
16446   // If more than one full vector is evaluated, OR them first before PTEST.
16447   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16448     // Each iteration will OR 2 nodes and append the result until there is only
16449     // 1 node left, i.e. the final OR'd value of all vectors.
16450     SDValue LHS = VecIns[Slot];
16451     SDValue RHS = VecIns[Slot + 1];
16452     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16453   }
16454
16455   return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16456 }
16457
16458 /// \brief return true if \c Op has a use that doesn't just read flags.
16459 static bool hasNonFlagsUse(SDValue Op) {
16460   for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16461        ++UI) {
16462     SDNode *User = *UI;
16463     unsigned UOpNo = UI.getOperandNo();
16464     if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16465       // Look pass truncate.
16466       UOpNo = User->use_begin().getOperandNo();
16467       User = *User->use_begin();
16468     }
16469
16470     if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16471         !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16472       return true;
16473   }
16474   return false;
16475 }
16476
16477 // Emit KTEST instruction for bit vectors on AVX-512
16478 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16479                          const X86Subtarget &Subtarget) {
16480   if (Op.getOpcode() == ISD::BITCAST) {
16481     auto hasKTEST = [&](MVT VT) {
16482       unsigned SizeInBits = VT.getSizeInBits();
16483       return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16484         (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16485     };
16486     SDValue Op0 = Op.getOperand(0);
16487     MVT Op0VT = Op0.getValueType().getSimpleVT();
16488     if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16489         hasKTEST(Op0VT))
16490       return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16491   }
16492   return SDValue();
16493 }
16494
16495 /// Emit nodes that will be selected as "test Op0,Op0", or something
16496 /// equivalent.
16497 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16498                                     SelectionDAG &DAG) const {
16499   if (Op.getValueType() == MVT::i1) {
16500     SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16501     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16502                        DAG.getConstant(0, dl, MVT::i8));
16503   }
16504   // CF and OF aren't always set the way we want. Determine which
16505   // of these we need.
16506   bool NeedCF = false;
16507   bool NeedOF = false;
16508   switch (X86CC) {
16509   default: break;
16510   case X86::COND_A: case X86::COND_AE:
16511   case X86::COND_B: case X86::COND_BE:
16512     NeedCF = true;
16513     break;
16514   case X86::COND_G: case X86::COND_GE:
16515   case X86::COND_L: case X86::COND_LE:
16516   case X86::COND_O: case X86::COND_NO: {
16517     // Check if we really need to set the
16518     // Overflow flag. If NoSignedWrap is present
16519     // that is not actually needed.
16520     switch (Op->getOpcode()) {
16521     case ISD::ADD:
16522     case ISD::SUB:
16523     case ISD::MUL:
16524     case ISD::SHL:
16525       if (Op.getNode()->getFlags().hasNoSignedWrap())
16526         break;
16527       LLVM_FALLTHROUGH;
16528     default:
16529       NeedOF = true;
16530       break;
16531     }
16532     break;
16533   }
16534   }
16535   // See if we can use the EFLAGS value from the operand instead of
16536   // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16537   // we prove that the arithmetic won't overflow, we can't use OF or CF.
16538   if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16539     // Emit KTEST for bit vectors
16540     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16541       return Node;
16542     // Emit a CMP with 0, which is the TEST pattern.
16543     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16544                        DAG.getConstant(0, dl, Op.getValueType()));
16545   }
16546   unsigned Opcode = 0;
16547   unsigned NumOperands = 0;
16548
16549   // Truncate operations may prevent the merge of the SETCC instruction
16550   // and the arithmetic instruction before it. Attempt to truncate the operands
16551   // of the arithmetic instruction and use a reduced bit-width instruction.
16552   bool NeedTruncation = false;
16553   SDValue ArithOp = Op;
16554   if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16555     SDValue Arith = Op->getOperand(0);
16556     // Both the trunc and the arithmetic op need to have one user each.
16557     if (Arith->hasOneUse())
16558       switch (Arith.getOpcode()) {
16559         default: break;
16560         case ISD::ADD:
16561         case ISD::SUB:
16562         case ISD::AND:
16563         case ISD::OR:
16564         case ISD::XOR: {
16565           NeedTruncation = true;
16566           ArithOp = Arith;
16567         }
16568       }
16569   }
16570
16571   // Sometimes flags can be set either with an AND or with an SRL/SHL
16572   // instruction. SRL/SHL variant should be preferred for masks longer than this
16573   // number of bits.
16574   const int ShiftToAndMaxMaskWidth = 32;
16575   const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16576
16577   // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16578   // which may be the result of a CAST.  We use the variable 'Op', which is the
16579   // non-casted variable when we check for possible users.
16580   switch (ArithOp.getOpcode()) {
16581   case ISD::ADD:
16582     // Due to an isel shortcoming, be conservative if this add is likely to be
16583     // selected as part of a load-modify-store instruction. When the root node
16584     // in a match is a store, isel doesn't know how to remap non-chain non-flag
16585     // uses of other nodes in the match, such as the ADD in this case. This
16586     // leads to the ADD being left around and reselected, with the result being
16587     // two adds in the output.  Alas, even if none our users are stores, that
16588     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
16589     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
16590     // climbing the DAG back to the root, and it doesn't seem to be worth the
16591     // effort.
16592     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16593          UE = Op.getNode()->use_end(); UI != UE; ++UI)
16594       if (UI->getOpcode() != ISD::CopyToReg &&
16595           UI->getOpcode() != ISD::SETCC &&
16596           UI->getOpcode() != ISD::STORE)
16597         goto default_case;
16598
16599     if (ConstantSDNode *C =
16600         dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16601       // An add of one will be selected as an INC.
16602       if (C->isOne() && !Subtarget.slowIncDec()) {
16603         Opcode = X86ISD::INC;
16604         NumOperands = 1;
16605         break;
16606       }
16607
16608       // An add of negative one (subtract of one) will be selected as a DEC.
16609       if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16610         Opcode = X86ISD::DEC;
16611         NumOperands = 1;
16612         break;
16613       }
16614     }
16615
16616     // Otherwise use a regular EFLAGS-setting add.
16617     Opcode = X86ISD::ADD;
16618     NumOperands = 2;
16619     break;
16620   case ISD::SHL:
16621   case ISD::SRL:
16622     // If we have a constant logical shift that's only used in a comparison
16623     // against zero turn it into an equivalent AND. This allows turning it into
16624     // a TEST instruction later.
16625     if (ZeroCheck && Op->hasOneUse() &&
16626         isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16627       EVT VT = Op.getValueType();
16628       unsigned BitWidth = VT.getSizeInBits();
16629       unsigned ShAmt = Op->getConstantOperandVal(1);
16630       if (ShAmt >= BitWidth) // Avoid undefined shifts.
16631         break;
16632       APInt Mask = ArithOp.getOpcode() == ISD::SRL
16633                        ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16634                        : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16635       if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16636         break;
16637       Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16638                        DAG.getConstant(Mask, dl, VT));
16639     }
16640     break;
16641
16642   case ISD::AND:
16643     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16644     // because a TEST instruction will be better. However, AND should be
16645     // preferred if the instruction can be combined into ANDN.
16646     if (!hasNonFlagsUse(Op)) {
16647       SDValue Op0 = ArithOp->getOperand(0);
16648       SDValue Op1 = ArithOp->getOperand(1);
16649       EVT VT = ArithOp.getValueType();
16650       bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16651       bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16652       bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16653
16654       // If we cannot select an ANDN instruction, check if we can replace
16655       // AND+IMM64 with a shift before giving up. This is possible for masks
16656       // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16657       if (!isProperAndn) {
16658         if (!ZeroCheck)
16659           break;
16660
16661         assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16662         auto *CN = dyn_cast<ConstantSDNode>(Op1);
16663         if (!CN)
16664           break;
16665
16666         const APInt &Mask = CN->getAPIntValue();
16667         if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16668           break; // Prefer TEST instruction.
16669
16670         unsigned BitWidth = Mask.getBitWidth();
16671         unsigned LeadingOnes = Mask.countLeadingOnes();
16672         unsigned TrailingZeros = Mask.countTrailingZeros();
16673
16674         if (LeadingOnes + TrailingZeros == BitWidth) {
16675           assert(TrailingZeros < VT.getSizeInBits() &&
16676                  "Shift amount should be less than the type width");
16677           MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16678           SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16679           Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16680           break;
16681         }
16682
16683         unsigned LeadingZeros = Mask.countLeadingZeros();
16684         unsigned TrailingOnes = Mask.countTrailingOnes();
16685
16686         if (LeadingZeros + TrailingOnes == BitWidth) {
16687           assert(LeadingZeros < VT.getSizeInBits() &&
16688                  "Shift amount should be less than the type width");
16689           MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16690           SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16691           Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16692           break;
16693         }
16694
16695         break;
16696       }
16697     }
16698     LLVM_FALLTHROUGH;
16699   case ISD::SUB:
16700   case ISD::OR:
16701   case ISD::XOR:
16702     // Due to the ISEL shortcoming noted above, be conservative if this op is
16703     // likely to be selected as part of a load-modify-store instruction.
16704     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16705            UE = Op.getNode()->use_end(); UI != UE; ++UI)
16706       if (UI->getOpcode() == ISD::STORE)
16707         goto default_case;
16708
16709     // Otherwise use a regular EFLAGS-setting instruction.
16710     switch (ArithOp.getOpcode()) {
16711     default: llvm_unreachable("unexpected operator!");
16712     case ISD::SUB: Opcode = X86ISD::SUB; break;
16713     case ISD::XOR: Opcode = X86ISD::XOR; break;
16714     case ISD::AND: Opcode = X86ISD::AND; break;
16715     case ISD::OR: {
16716       if (!NeedTruncation && ZeroCheck) {
16717         if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16718           return EFLAGS;
16719       }
16720       Opcode = X86ISD::OR;
16721       break;
16722     }
16723     }
16724
16725     NumOperands = 2;
16726     break;
16727   case X86ISD::ADD:
16728   case X86ISD::SUB:
16729   case X86ISD::INC:
16730   case X86ISD::DEC:
16731   case X86ISD::OR:
16732   case X86ISD::XOR:
16733   case X86ISD::AND:
16734     return SDValue(Op.getNode(), 1);
16735   default:
16736   default_case:
16737     break;
16738   }
16739
16740   // If we found that truncation is beneficial, perform the truncation and
16741   // update 'Op'.
16742   if (NeedTruncation) {
16743     EVT VT = Op.getValueType();
16744     SDValue WideVal = Op->getOperand(0);
16745     EVT WideVT = WideVal.getValueType();
16746     unsigned ConvertedOp = 0;
16747     // Use a target machine opcode to prevent further DAGCombine
16748     // optimizations that may separate the arithmetic operations
16749     // from the setcc node.
16750     switch (WideVal.getOpcode()) {
16751       default: break;
16752       case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16753       case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16754       case ISD::AND: ConvertedOp = X86ISD::AND; break;
16755       case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
16756       case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16757     }
16758
16759     if (ConvertedOp) {
16760       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16761       if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16762         SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16763         SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16764         Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16765       }
16766     }
16767   }
16768
16769   if (Opcode == 0) {
16770     // Emit KTEST for bit vectors
16771     if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16772       return Node;
16773
16774     // Emit a CMP with 0, which is the TEST pattern.
16775     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16776                        DAG.getConstant(0, dl, Op.getValueType()));
16777   }
16778   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16779   SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16780
16781   SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16782   DAG.ReplaceAllUsesWith(Op, New);
16783   return SDValue(New.getNode(), 1);
16784 }
16785
16786 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16787 /// equivalent.
16788 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16789                                    const SDLoc &dl, SelectionDAG &DAG) const {
16790   if (isNullConstant(Op1))
16791     return EmitTest(Op0, X86CC, dl, DAG);
16792
16793   assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16794          "Unexpected comparison operation for MVT::i1 operands");
16795
16796   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16797        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16798     // Only promote the compare up to I32 if it is a 16 bit operation
16799     // with an immediate.  16 bit immediates are to be avoided.
16800     if ((Op0.getValueType() == MVT::i16 &&
16801          (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16802         !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16803         !Subtarget.isAtom()) {
16804       unsigned ExtendOp =
16805           isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16806       Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16807       Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16808     }
16809     // Use SUB instead of CMP to enable CSE between SUB and CMP.
16810     SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16811     SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16812                               Op0, Op1);
16813     return SDValue(Sub.getNode(), 1);
16814   }
16815   return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16816 }
16817
16818 /// Convert a comparison if required by the subtarget.
16819 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16820                                                  SelectionDAG &DAG) const {
16821   // If the subtarget does not support the FUCOMI instruction, floating-point
16822   // comparisons have to be converted.
16823   if (Subtarget.hasCMov() ||
16824       Cmp.getOpcode() != X86ISD::CMP ||
16825       !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16826       !Cmp.getOperand(1).getValueType().isFloatingPoint())
16827     return Cmp;
16828
16829   // The instruction selector will select an FUCOM instruction instead of
16830   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16831   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16832   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16833   SDLoc dl(Cmp);
16834   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16835   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16836   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16837                             DAG.getConstant(8, dl, MVT::i8));
16838   SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16839
16840   // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16841   assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16842   return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16843 }
16844
16845 /// Check if replacement of SQRT with RSQRT should be disabled.
16846 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16847   EVT VT = Op.getValueType();
16848
16849   // We never want to use both SQRT and RSQRT instructions for the same input.
16850   if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16851     return false;
16852
16853   if (VT.isVector())
16854     return Subtarget.hasFastVectorFSQRT();
16855   return Subtarget.hasFastScalarFSQRT();
16856 }
16857
16858 /// The minimum architected relative accuracy is 2^-12. We need one
16859 /// Newton-Raphson step to have a good float result (24 bits of precision).
16860 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16861                                            SelectionDAG &DAG, int Enabled,
16862                                            int &RefinementSteps,
16863                                            bool &UseOneConstNR,
16864                                            bool Reciprocal) const {
16865   EVT VT = Op.getValueType();
16866
16867   // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16868   // TODO: Add support for AVX512 (v16f32).
16869   // It is likely not profitable to do this for f64 because a double-precision
16870   // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16871   // instructions: convert to single, rsqrtss, convert back to double, refine
16872   // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16873   // along with FMA, this could be a throughput win.
16874   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16875       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16876       (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16877     if (RefinementSteps == ReciprocalEstimate::Unspecified)
16878       RefinementSteps = 1;
16879
16880     UseOneConstNR = false;
16881     return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16882   }
16883   return SDValue();
16884 }
16885
16886 /// The minimum architected relative accuracy is 2^-12. We need one
16887 /// Newton-Raphson step to have a good float result (24 bits of precision).
16888 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16889                                             int Enabled,
16890                                             int &RefinementSteps) const {
16891   EVT VT = Op.getValueType();
16892
16893   // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16894   // TODO: Add support for AVX512 (v16f32).
16895   // It is likely not profitable to do this for f64 because a double-precision
16896   // reciprocal estimate with refinement on x86 prior to FMA requires
16897   // 15 instructions: convert to single, rcpss, convert back to double, refine
16898   // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16899   // along with FMA, this could be a throughput win.
16900
16901   if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16902       (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16903       (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16904     // Enable estimate codegen with 1 refinement step for vector division.
16905     // Scalar division estimates are disabled because they break too much
16906     // real-world code. These defaults are intended to match GCC behavior.
16907     if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16908       return SDValue();
16909
16910     if (RefinementSteps == ReciprocalEstimate::Unspecified)
16911       RefinementSteps = 1;
16912
16913     return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16914   }
16915   return SDValue();
16916 }
16917
16918 /// If we have at least two divisions that use the same divisor, convert to
16919 /// multiplication by a reciprocal. This may need to be adjusted for a given
16920 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16921 /// This is because we still need one division to calculate the reciprocal and
16922 /// then we need two multiplies by that reciprocal as replacements for the
16923 /// original divisions.
16924 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16925   return 2;
16926 }
16927
16928 /// Helper for creating a X86ISD::SETCC node.
16929 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16930                         SelectionDAG &DAG) {
16931   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16932                      DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16933 }
16934
16935 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16936 /// according to equal/not-equal condition code \p CC.
16937 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16938                                    const SDLoc &dl, SelectionDAG &DAG) {
16939   // If Src is i8, promote it to i32 with any_extend.  There is no i8 BT
16940   // instruction.  Since the shift amount is in-range-or-undefined, we know
16941   // that doing a bittest on the i32 value is ok.  We extend to i32 because
16942   // the encoding for the i16 version is larger than the i32 version.
16943   // Also promote i16 to i32 for performance / code size reason.
16944   if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16945     Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16946
16947   // See if we can use the 32-bit instruction instead of the 64-bit one for a
16948   // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16949   // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16950   // known to be zero.
16951   if (Src.getValueType() == MVT::i64 &&
16952       DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16953     Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16954
16955   // If the operand types disagree, extend the shift amount to match.  Since
16956   // BT ignores high bits (like shifts) we can use anyextend.
16957   if (Src.getValueType() != BitNo.getValueType())
16958     BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16959
16960   SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16961   X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16962   return getSETCC(Cond, BT, dl , DAG);
16963 }
16964
16965 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16966 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16967                             const SDLoc &dl, SelectionDAG &DAG) {
16968   SDValue Op0 = And.getOperand(0);
16969   SDValue Op1 = And.getOperand(1);
16970   if (Op0.getOpcode() == ISD::TRUNCATE)
16971     Op0 = Op0.getOperand(0);
16972   if (Op1.getOpcode() == ISD::TRUNCATE)
16973     Op1 = Op1.getOperand(0);
16974
16975   SDValue LHS, RHS;
16976   if (Op1.getOpcode() == ISD::SHL)
16977     std::swap(Op0, Op1);
16978   if (Op0.getOpcode() == ISD::SHL) {
16979     if (isOneConstant(Op0.getOperand(0))) {
16980       // If we looked past a truncate, check that it's only truncating away
16981       // known zeros.
16982       unsigned BitWidth = Op0.getValueSizeInBits();
16983       unsigned AndBitWidth = And.getValueSizeInBits();
16984       if (BitWidth > AndBitWidth) {
16985         KnownBits Known;
16986         DAG.computeKnownBits(Op0, Known);
16987         if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
16988           return SDValue();
16989       }
16990       LHS = Op1;
16991       RHS = Op0.getOperand(1);
16992     }
16993   } else if (Op1.getOpcode() == ISD::Constant) {
16994     ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16995     uint64_t AndRHSVal = AndRHS->getZExtValue();
16996     SDValue AndLHS = Op0;
16997
16998     if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16999       LHS = AndLHS.getOperand(0);
17000       RHS = AndLHS.getOperand(1);
17001     }
17002
17003     // Use BT if the immediate can't be encoded in a TEST instruction.
17004     if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17005       LHS = AndLHS;
17006       RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17007     }
17008   }
17009
17010   if (LHS.getNode())
17011     return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17012
17013   return SDValue();
17014 }
17015
17016 // Convert (truncate (srl X, N) to i1) to (bt X, N)
17017 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
17018                                  const SDLoc &dl, SelectionDAG &DAG) {
17019
17020   assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
17021          "Expected TRUNCATE to i1 node");
17022
17023   if (Op.getOperand(0).getOpcode() != ISD::SRL)
17024     return SDValue();
17025
17026   SDValue ShiftRight = Op.getOperand(0);
17027   return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
17028                              CC, dl, DAG);
17029 }
17030
17031 /// Result of 'and' or 'trunc to i1' is compared against zero.
17032 /// Change to a BT node if possible.
17033 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
17034                                      const SDLoc &dl, SelectionDAG &DAG) const {
17035   if (Op.getOpcode() == ISD::AND)
17036     return LowerAndToBT(Op, CC, dl, DAG);
17037   if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
17038     return LowerTruncateToBT(Op, CC, dl, DAG);
17039   return SDValue();
17040 }
17041
17042 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
17043 /// CMPs.
17044 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17045                               SDValue &Op1) {
17046   unsigned SSECC;
17047   bool Swap = false;
17048
17049   // SSE Condition code mapping:
17050   //  0 - EQ
17051   //  1 - LT
17052   //  2 - LE
17053   //  3 - UNORD
17054   //  4 - NEQ
17055   //  5 - NLT
17056   //  6 - NLE
17057   //  7 - ORD
17058   switch (SetCCOpcode) {
17059   default: llvm_unreachable("Unexpected SETCC condition");
17060   case ISD::SETOEQ:
17061   case ISD::SETEQ:  SSECC = 0; break;
17062   case ISD::SETOGT:
17063   case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
17064   case ISD::SETLT:
17065   case ISD::SETOLT: SSECC = 1; break;
17066   case ISD::SETOGE:
17067   case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
17068   case ISD::SETLE:
17069   case ISD::SETOLE: SSECC = 2; break;
17070   case ISD::SETUO:  SSECC = 3; break;
17071   case ISD::SETUNE:
17072   case ISD::SETNE:  SSECC = 4; break;
17073   case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17074   case ISD::SETUGE: SSECC = 5; break;
17075   case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17076   case ISD::SETUGT: SSECC = 6; break;
17077   case ISD::SETO:   SSECC = 7; break;
17078   case ISD::SETUEQ:
17079   case ISD::SETONE: SSECC = 8; break;
17080   }
17081   if (Swap)
17082     std::swap(Op0, Op1);
17083
17084   return SSECC;
17085 }
17086
17087 /// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
17088 /// concatenate the result back.
17089 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17090   MVT VT = Op.getSimpleValueType();
17091
17092   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17093          "Unsupported value type for operation");
17094
17095   unsigned NumElems = VT.getVectorNumElements();
17096   SDLoc dl(Op);
17097   SDValue CC = Op.getOperand(2);
17098
17099   // Extract the LHS vectors
17100   SDValue LHS = Op.getOperand(0);
17101   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17102   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17103
17104   // Extract the RHS vectors
17105   SDValue RHS = Op.getOperand(1);
17106   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17107   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17108
17109   // Issue the operation on the smaller types and concatenate the result back
17110   MVT EltVT = VT.getVectorElementType();
17111   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17112   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17113                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17114                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17115 }
17116
17117 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17118   SDValue Op0 = Op.getOperand(0);
17119   SDValue Op1 = Op.getOperand(1);
17120   SDValue CC = Op.getOperand(2);
17121   MVT VT = Op.getSimpleValueType();
17122   SDLoc dl(Op);
17123
17124   assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17125          "Unexpected type for boolean compare operation");
17126   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17127   SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17128                                DAG.getConstant(-1, dl, VT));
17129   SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17130                                DAG.getConstant(-1, dl, VT));
17131   switch (SetCCOpcode) {
17132   default: llvm_unreachable("Unexpected SETCC condition");
17133   case ISD::SETEQ:
17134     // (x == y) -> ~(x ^ y)
17135     return DAG.getNode(ISD::XOR, dl, VT,
17136                        DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17137                        DAG.getConstant(-1, dl, VT));
17138   case ISD::SETNE:
17139     // (x != y) -> (x ^ y)
17140     return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17141   case ISD::SETUGT:
17142   case ISD::SETGT:
17143     // (x > y) -> (x & ~y)
17144     return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17145   case ISD::SETULT:
17146   case ISD::SETLT:
17147     // (x < y) -> (~x & y)
17148     return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17149   case ISD::SETULE:
17150   case ISD::SETLE:
17151     // (x <= y) -> (~x | y)
17152     return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17153   case ISD::SETUGE:
17154   case ISD::SETGE:
17155     // (x >=y) -> (x | ~y)
17156     return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17157   }
17158 }
17159
17160 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17161
17162   SDValue Op0 = Op.getOperand(0);
17163   SDValue Op1 = Op.getOperand(1);
17164   SDValue CC = Op.getOperand(2);
17165   MVT VT = Op.getSimpleValueType();
17166   SDLoc dl(Op);
17167
17168   assert(VT.getVectorElementType() == MVT::i1 &&
17169          "Cannot set masked compare for this operation");
17170
17171   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17172   unsigned  Opc = 0;
17173   bool Unsigned = false;
17174   bool Swap = false;
17175   unsigned SSECC;
17176   switch (SetCCOpcode) {
17177   default: llvm_unreachable("Unexpected SETCC condition");
17178   case ISD::SETNE:  SSECC = 4; break;
17179   case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
17180   case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17181   case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
17182   case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
17183   case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17184   case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17185   case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
17186   case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17187   case ISD::SETLE:  SSECC = 2; break;
17188   }
17189
17190   if (Swap)
17191     std::swap(Op0, Op1);
17192   if (Opc)
17193     return DAG.getNode(Opc, dl, VT, Op0, Op1);
17194   Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17195   return DAG.getNode(Opc, dl, VT, Op0, Op1,
17196                      DAG.getConstant(SSECC, dl, MVT::i8));
17197 }
17198
17199 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17200 /// operand \p Op1.  If non-trivial (for example because it's not constant)
17201 /// return an empty value.
17202 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17203                                       SelectionDAG &DAG) {
17204   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17205   if (!BV)
17206     return SDValue();
17207
17208   MVT VT = Op1.getSimpleValueType();
17209   MVT EVT = VT.getVectorElementType();
17210   unsigned n = VT.getVectorNumElements();
17211   SmallVector<SDValue, 8> ULTOp1;
17212
17213   for (unsigned i = 0; i < n; ++i) {
17214     ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17215     if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17216       return SDValue();
17217
17218     // Avoid underflow.
17219     APInt Val = Elt->getAPIntValue();
17220     if (Val == 0)
17221       return SDValue();
17222
17223     ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17224   }
17225
17226   return DAG.getBuildVector(VT, dl, ULTOp1);
17227 }
17228
17229 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17230                            SelectionDAG &DAG) {
17231   SDValue Op0 = Op.getOperand(0);
17232   SDValue Op1 = Op.getOperand(1);
17233   SDValue CC = Op.getOperand(2);
17234   MVT VT = Op.getSimpleValueType();
17235   ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17236   bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17237   SDLoc dl(Op);
17238
17239   if (isFP) {
17240 #ifndef NDEBUG
17241     MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17242     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17243 #endif
17244
17245     unsigned Opc;
17246     if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17247       assert(VT.getVectorNumElements() <= 16);
17248       Opc = X86ISD::CMPM;
17249     } else {
17250       Opc = X86ISD::CMPP;
17251       // The SSE/AVX packed FP comparison nodes are defined with a
17252       // floating-point vector result that matches the operand type. This allows
17253       // them to work with an SSE1 target (integer vector types are not legal).
17254       VT = Op0.getSimpleValueType();
17255     }
17256
17257     // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17258     // emit two comparisons and a logic op to tie them together.
17259     // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17260     // available.
17261     SDValue Cmp;
17262     unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17263     if (SSECC == 8) {
17264       // LLVM predicate is SETUEQ or SETONE.
17265       unsigned CC0, CC1;
17266       unsigned CombineOpc;
17267       if (Cond == ISD::SETUEQ) {
17268         CC0 = 3; // UNORD
17269         CC1 = 0; // EQ
17270         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17271                                            static_cast<unsigned>(ISD::OR);
17272       } else {
17273         assert(Cond == ISD::SETONE);
17274         CC0 = 7; // ORD
17275         CC1 = 4; // NEQ
17276         CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17277                                            static_cast<unsigned>(ISD::AND);
17278       }
17279
17280       SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17281                                  DAG.getConstant(CC0, dl, MVT::i8));
17282       SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17283                                  DAG.getConstant(CC1, dl, MVT::i8));
17284       Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17285     } else {
17286       // Handle all other FP comparisons here.
17287       Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17288                         DAG.getConstant(SSECC, dl, MVT::i8));
17289     }
17290
17291     // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17292     // result type of SETCC. The bitcast is expected to be optimized away
17293     // during combining/isel.
17294     if (Opc == X86ISD::CMPP)
17295       Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17296
17297     return Cmp;
17298   }
17299
17300   MVT VTOp0 = Op0.getSimpleValueType();
17301   assert(VTOp0 == Op1.getSimpleValueType() &&
17302          "Expected operands with same type!");
17303   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17304          "Invalid number of packed elements for source and destination!");
17305
17306   if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17307     // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17308     // legalizer to a wider vector type.  In the case of 'vsetcc' nodes, the
17309     // legalizer firstly checks if the first operand in input to the setcc has
17310     // a legal type. If so, then it promotes the return type to that same type.
17311     // Otherwise, the return type is promoted to the 'next legal type' which,
17312     // for a vector of MVT::i1 is always a 128-bit integer vector type.
17313     //
17314     // We reach this code only if the following two conditions are met:
17315     // 1. Both return type and operand type have been promoted to wider types
17316     //    by the type legalizer.
17317     // 2. The original operand type has been promoted to a 256-bit vector.
17318     //
17319     // Note that condition 2. only applies for AVX targets.
17320     SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17321     return DAG.getZExtOrTrunc(NewOp, dl, VT);
17322   }
17323
17324   // The non-AVX512 code below works under the assumption that source and
17325   // destination types are the same.
17326   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17327          "Value types for source and destination must be the same!");
17328
17329   // Break 256-bit integer vector compare into smaller ones.
17330   if (VT.is256BitVector() && !Subtarget.hasInt256())
17331     return Lower256IntVSETCC(Op, DAG);
17332
17333   // Operands are boolean (vectors of i1)
17334   MVT OpVT = Op1.getSimpleValueType();
17335   if (OpVT.getVectorElementType() == MVT::i1)
17336     return LowerBoolVSETCC_AVX512(Op, DAG);
17337
17338   // The result is boolean, but operands are int/float
17339   if (VT.getVectorElementType() == MVT::i1) {
17340     // In AVX-512 architecture setcc returns mask with i1 elements,
17341     // But there is no compare instruction for i8 and i16 elements in KNL.
17342     // In this case use SSE compare
17343     bool UseAVX512Inst =
17344       (OpVT.is512BitVector() ||
17345        OpVT.getScalarSizeInBits() >= 32 ||
17346        (Subtarget.hasBWI() && Subtarget.hasVLX()));
17347
17348     if (UseAVX512Inst)
17349       return LowerIntVSETCC_AVX512(Op, DAG);
17350
17351     return DAG.getNode(ISD::TRUNCATE, dl, VT,
17352                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17353   }
17354
17355   // Lower using XOP integer comparisons.
17356   if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17357        VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17358     // Translate compare code to XOP PCOM compare mode.
17359     unsigned CmpMode = 0;
17360     switch (Cond) {
17361     default: llvm_unreachable("Unexpected SETCC condition");
17362     case ISD::SETULT:
17363     case ISD::SETLT: CmpMode = 0x00; break;
17364     case ISD::SETULE:
17365     case ISD::SETLE: CmpMode = 0x01; break;
17366     case ISD::SETUGT:
17367     case ISD::SETGT: CmpMode = 0x02; break;
17368     case ISD::SETUGE:
17369     case ISD::SETGE: CmpMode = 0x03; break;
17370     case ISD::SETEQ: CmpMode = 0x04; break;
17371     case ISD::SETNE: CmpMode = 0x05; break;
17372     }
17373
17374     // Are we comparing unsigned or signed integers?
17375     unsigned Opc =
17376         ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17377
17378     return DAG.getNode(Opc, dl, VT, Op0, Op1,
17379                        DAG.getConstant(CmpMode, dl, MVT::i8));
17380   }
17381
17382   // We are handling one of the integer comparisons here. Since SSE only has
17383   // GT and EQ comparisons for integer, swapping operands and multiple
17384   // operations may be required for some comparisons.
17385   unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17386                                                             : X86ISD::PCMPGT;
17387   bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17388               Cond == ISD::SETGE || Cond == ISD::SETUGE;
17389   bool Invert = Cond == ISD::SETNE ||
17390                 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17391
17392   // If both operands are known non-negative, then an unsigned compare is the
17393   // same as a signed compare and there's no need to flip signbits.
17394   // TODO: We could check for more general simplifications here since we're
17395   // computing known bits.
17396   bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17397                    !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17398
17399   // Special case: Use min/max operations for SETULE/SETUGE
17400   MVT VET = VT.getVectorElementType();
17401   bool HasMinMax =
17402       (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
17403       (Subtarget.hasSSE2() && (VET == MVT::i8));
17404   bool MinMax = false;
17405   if (HasMinMax) {
17406     switch (Cond) {
17407     default: break;
17408     case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17409     case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17410     }
17411
17412     if (MinMax)
17413       Swap = Invert = FlipSigns = false;
17414   }
17415
17416   bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17417   bool Subus = false;
17418   if (!MinMax && HasSubus) {
17419     // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17420     // Op0 u<= Op1:
17421     //   t = psubus Op0, Op1
17422     //   pcmpeq t, <0..0>
17423     switch (Cond) {
17424     default: break;
17425     case ISD::SETULT: {
17426       // If the comparison is against a constant we can turn this into a
17427       // setule.  With psubus, setule does not require a swap.  This is
17428       // beneficial because the constant in the register is no longer
17429       // destructed as the destination so it can be hoisted out of a loop.
17430       // Only do this pre-AVX since vpcmp* is no longer destructive.
17431       if (Subtarget.hasAVX())
17432         break;
17433       if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17434         Op1 = ULEOp1;
17435         Subus = true; Invert = false; Swap = false;
17436       }
17437       break;
17438     }
17439     // Psubus is better than flip-sign because it requires no inversion.
17440     case ISD::SETUGE: Subus = true; Invert = false; Swap = true;  break;
17441     case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17442     }
17443
17444     if (Subus) {
17445       Opc = X86ISD::SUBUS;
17446       FlipSigns = false;
17447     }
17448   }
17449
17450   if (Swap)
17451     std::swap(Op0, Op1);
17452
17453   // Check that the operation in question is available (most are plain SSE2,
17454   // but PCMPGTQ and PCMPEQQ have different requirements).
17455   if (VT == MVT::v2i64) {
17456     if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17457       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17458
17459       // First cast everything to the right type.
17460       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17461       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17462
17463       // Since SSE has no unsigned integer comparisons, we need to flip the sign
17464       // bits of the inputs before performing those operations. The lower
17465       // compare is always unsigned.
17466       SDValue SB;
17467       if (FlipSigns) {
17468         SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17469       } else {
17470         SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17471         SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17472         SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17473       }
17474       Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17475       Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17476
17477       // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17478       SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17479       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17480
17481       // Create masks for only the low parts/high parts of the 64 bit integers.
17482       static const int MaskHi[] = { 1, 1, 3, 3 };
17483       static const int MaskLo[] = { 0, 0, 2, 2 };
17484       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17485       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17486       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17487
17488       SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17489       Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17490
17491       if (Invert)
17492         Result = DAG.getNOT(dl, Result, MVT::v4i32);
17493
17494       return DAG.getBitcast(VT, Result);
17495     }
17496
17497     if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17498       // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17499       // pcmpeqd + pshufd + pand.
17500       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17501
17502       // First cast everything to the right type.
17503       Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17504       Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17505
17506       // Do the compare.
17507       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17508
17509       // Make sure the lower and upper halves are both all-ones.
17510       static const int Mask[] = { 1, 0, 3, 2 };
17511       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17512       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17513
17514       if (Invert)
17515         Result = DAG.getNOT(dl, Result, MVT::v4i32);
17516
17517       return DAG.getBitcast(VT, Result);
17518     }
17519   }
17520
17521   // Since SSE has no unsigned integer comparisons, we need to flip the sign
17522   // bits of the inputs before performing those operations.
17523   if (FlipSigns) {
17524     MVT EltVT = VT.getVectorElementType();
17525     SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17526                                  VT);
17527     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17528     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17529   }
17530
17531   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17532
17533   // If the logical-not of the result is required, perform that now.
17534   if (Invert)
17535     Result = DAG.getNOT(dl, Result, VT);
17536
17537   if (MinMax)
17538     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17539
17540   if (Subus)
17541     Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17542                          getZeroVector(VT, Subtarget, DAG, dl));
17543
17544   return Result;
17545 }
17546
17547 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17548
17549   MVT VT = Op.getSimpleValueType();
17550
17551   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17552
17553   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17554   SDValue Op0 = Op.getOperand(0);
17555   SDValue Op1 = Op.getOperand(1);
17556   SDLoc dl(Op);
17557   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17558
17559   // Optimize to BT if possible.
17560   // Lower (X & (1 << N)) == 0 to BT(X, N).
17561   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17562   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17563   // Lower (trunc (X >> N) to i1) to BT(X, N).
17564   if (Op0.hasOneUse() && isNullConstant(Op1) &&
17565       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17566     if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17567       if (VT == MVT::i1)
17568         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17569       return NewSetCC;
17570     }
17571   }
17572
17573   // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
17574   // these.
17575   if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17576       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17577
17578     // If the input is a setcc, then reuse the input setcc or use a new one with
17579     // the inverted condition.
17580     if (Op0.getOpcode() == X86ISD::SETCC) {
17581       X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17582       bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17583       if (!Invert)
17584         return Op0;
17585
17586       CCode = X86::GetOppositeBranchCondition(CCode);
17587       SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17588       if (VT == MVT::i1)
17589         return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17590       return SetCC;
17591     }
17592   }
17593   if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17594     if (isOneConstant(Op1)) {
17595       ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17596       return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17597     }
17598     if (!isNullConstant(Op1)) {
17599       SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17600       return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17601     }
17602   }
17603
17604   bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17605   X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17606   if (X86CC == X86::COND_INVALID)
17607     return SDValue();
17608
17609   SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17610   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17611   SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17612   if (VT == MVT::i1)
17613     return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17614   return SetCC;
17615 }
17616
17617 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17618   SDValue LHS = Op.getOperand(0);
17619   SDValue RHS = Op.getOperand(1);
17620   SDValue Carry = Op.getOperand(2);
17621   SDValue Cond = Op.getOperand(3);
17622   SDLoc DL(Op);
17623
17624   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17625   X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17626
17627   // Recreate the carry if needed.
17628   EVT CarryVT = Carry.getValueType();
17629   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17630   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17631                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
17632
17633   SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17634   SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17635   SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17636   if (Op.getSimpleValueType() == MVT::i1)
17637     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17638   return SetCC;
17639 }
17640
17641 /// Return true if opcode is a X86 logical comparison.
17642 static bool isX86LogicalCmp(SDValue Op) {
17643   unsigned Opc = Op.getOpcode();
17644   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17645       Opc == X86ISD::SAHF)
17646     return true;
17647   if (Op.getResNo() == 1 &&
17648       (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17649        Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17650        Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17651        Opc == X86ISD::XOR || Opc == X86ISD::AND))
17652     return true;
17653
17654   if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17655     return true;
17656
17657   return false;
17658 }
17659
17660 /// True if V is a TRUNCATE whose discarded source high bits are known zero.
17661 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17662   if (V.getOpcode() != ISD::TRUNCATE)
17663     return false;
17664   SDValue Src = V.getOperand(0);
17665   unsigned SrcBits = Src.getValueSizeInBits(), DstBits = V.getValueSizeInBits();
17666   APInt DroppedMask = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
17667   return DAG.MaskedValueIsZero(Src, DroppedMask);
17668 }
17669 /// Custom-lower ISD::SELECT (Cond, Op1, Op2); the fallback at the end is X86ISD::CMOV.
17670 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17671   bool AddTest = true; // True while Cond still needs an explicit EmitTest.
17672   SDValue Cond  = Op.getOperand(0);
17673   SDValue Op1 = Op.getOperand(1); // Value chosen when Cond is true.
17674   SDValue Op2 = Op.getOperand(2); // Value chosen when Cond is false.
17675   SDLoc DL(Op);
17676   MVT VT = Op1.getSimpleValueType();
17677   SDValue CC; // X86 condition-code operand passed to the final CMOV.
17678
17679   // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17680   // are available or VBLENDV if AVX is available.
17681   // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17682   if (Cond.getOpcode() == ISD::SETCC &&
17683       ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17684        (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17685       VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17686     SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17687     int SSECC = translateX86FSETCC(
17688         cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17689
17690     if (SSECC != 8) { // 8 is the code LowerVSETCC handles as SETUEQ/SETONE.
17691       if (Subtarget.hasAVX512()) { // VT is scalar f32/f64 on this path.
17692         SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17693                                   CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17694         return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17695                            DL, VT, Cmp, Op1, Op2);
17696       }
17697
17698       SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17699                                 DAG.getConstant(SSECC, DL, MVT::i8));
17700
17701       // If we have AVX, we can use a variable vector select (VBLENDV) instead
17702       // of 3 logic instructions for size savings and potentially speed.
17703       // Unfortunately, there is no scalar form of VBLENDV.
17704
17705       // If either operand is a constant, don't try this. We can expect to
17706       // optimize away at least one of the logic instructions later in that
17707       // case, so that sequence would be faster than a variable blend.
17708
17709       // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17710       // uses XMM0 as the selection register. That may need just as many
17711       // instructions as the AND/ANDN/OR sequence due to register moves, so
17712       // don't bother.
17713
17714       if (Subtarget.hasAVX() &&
17715           !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17716
17717         // Convert to vectors, do a VSELECT, and convert back to scalar.
17718         // All of the conversions should be optimized away.
17719
17720         MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17721         SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17722         SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17723         SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17724
17725         MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17726         VCmp = DAG.getBitcast(VCmpVT, VCmp);
17727
17728         SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17729
17730         return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17731                            VSel, DAG.getIntPtrConstant(0, DL));
17732       }
17733       SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17734       SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17735       return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17736     }
17737   }
17738
17739   // AVX512 fallback is to lower selects of scalar floats to masked moves.
17740   if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17741     SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17742     return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17743   }
17744   // Mask (vXi1) select: if both arms reduce to scalars, select those instead.
17745   if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17746     SDValue Op1Scalar;
17747     if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17748       Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17749     else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17750       Op1Scalar = Op1.getOperand(0);
17751     SDValue Op2Scalar;
17752     if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17753       Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17754     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17755       Op2Scalar = Op2.getOperand(0);
17756     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17757       SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17758                                         Op1Scalar, Op2Scalar);
17759       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17760         return DAG.getBitcast(VT, newSelect);
17761       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17762       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17763                          DAG.getIntPtrConstant(0, DL));
17764     }
17765   }
17766   // Widen v2i1/v4i1 selects to v8i1 (upper elements undef) and extract the low part.
17767   if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17768     SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17769     Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17770                       DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17771     Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17772                       DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17773     SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17774     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17775   }
17776   // Lower a generic SETCC condition up front; the matching below keys off X86ISD nodes.
17777   if (Cond.getOpcode() == ISD::SETCC) {
17778     if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17779       Cond = NewCond;
17780       // If the condition was updated, it's possible that the operands of the
17781       // select were also updated (for example, EmitTest has a RAUW). Refresh
17782       // the local references to the select operands in case they got stale.
17783       Op1 = Op.getOperand(1);
17784       Op2 = Op.getOperand(2);
17785     }
17786   }
17787
17788   // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17789   // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17790   // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17791   // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17792   // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17793   // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
17794   if (Cond.getOpcode() == X86ISD::SETCC &&
17795       Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17796       isNullConstant(Cond.getOperand(1).getOperand(1))) {
17797     SDValue Cmp = Cond.getOperand(1);
17798     unsigned CondCode =
17799         cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17800
17801     if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17802         (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17803       SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; // The arm that is not the -1 side.
17804       SDValue CmpOp0 = Cmp.getOperand(0);
17805
17806       // Apply further optimizations for special cases
17807       // (select (x != 0), -1, 0) -> neg & sbb
17808       // (select (x == 0), 0, -1) -> neg & sbb
17809       if (isNullConstant(Y) &&
17810           (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17811         SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17812         SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
17813         SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
17814         SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17815                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
17816                                   SDValue(Neg.getNode(), 1));
17817         return Res;
17818       }
17819
17820       Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17821                         CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17822       Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17823
17824       SDValue Res =   // Res = 0 or -1.
17825         DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17826                     DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17827
17828       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17829         Res = DAG.getNOT(DL, Res, Res.getValueType());
17830
17831       if (!isNullConstant(Op2))
17832         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17833       return Res;
17834     } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17835                Cmp.getOperand(0).getOpcode() == ISD::AND &&
17836                isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17837       SDValue CmpOp0 = Cmp.getOperand(0);
17838       SDValue Src1, Src2;
17839       // true if Op2 is XOR or OR operator and one of its operands
17840       // is equal to Op1
17841       // ( a , a op b) || ( b , a op b)
17842       auto isOrXorPattern = [&]() {
17843         if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17844             (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17845           Src1 =
17846               Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17847           Src2 = Op1;
17848           return true;
17849         }
17850         return false;
17851       };
17852
17853       if (isOrXorPattern()) {
17854         SDValue Neg;
17855         unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17856         // we need mask of all zeros or ones with same size of the other
17857         // operands.
17858         if (CmpSz > VT.getSizeInBits())
17859           Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17860         else if (CmpSz < VT.getSizeInBits())
17861           Neg = DAG.getNode(ISD::AND, DL, VT,
17862               DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17863               DAG.getConstant(1, DL, VT));
17864         else
17865           Neg = CmpOp0;
17866         SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17867                                    Neg); // -(and (x, 0x1))
17868         SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17869         return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
17870       }
17871     }
17872   }
17873
17874   // Look past (and (setcc_carry (cmp ...)), 1).
17875   if (Cond.getOpcode() == ISD::AND &&
17876       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17877       isOneConstant(Cond.getOperand(1)))
17878     Cond = Cond.getOperand(0);
17879
17880   // If condition flag is set by a X86ISD::CMP, then use it as the condition
17881   // setting operand in place of the X86ISD::SETCC.
17882   unsigned CondOpcode = Cond.getOpcode();
17883   if (CondOpcode == X86ISD::SETCC ||
17884       CondOpcode == X86ISD::SETCC_CARRY) {
17885     CC = Cond.getOperand(0);
17886
17887     SDValue Cmp = Cond.getOperand(1);
17888     unsigned Opc = Cmp.getOpcode();
17889     MVT VT = Op.getSimpleValueType(); // Shadows the outer VT (which came from Op1).
17890
17891     bool IllegalFPCMov = false;
17892     if (VT.isFloatingPoint() && !VT.isVector() &&
17893         !isScalarFPTypeInSSEReg(VT))  // FPStack?
17894       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17895
17896     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17897         Opc == X86ISD::BT) { // FIXME
17898       Cond = Cmp;
17899       AddTest = false;
17900     }
17901   } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17902              CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17903              ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17904               Cond.getOperand(0).getValueType() != MVT::i8)) {
17905     SDValue LHS = Cond.getOperand(0);
17906     SDValue RHS = Cond.getOperand(1);
17907     unsigned X86Opcode;
17908     unsigned X86Cond;
17909     SDVTList VTs;
17910     switch (CondOpcode) {
17911     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17912     case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17913     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17914     case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17915     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17916     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17917     default: llvm_unreachable("unexpected overflowing operator");
17918     }
17919     if (CondOpcode == ISD::UMULO)
17920       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17921                           MVT::i32);
17922     else
17923       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17924     // Rebuild the overflow op as an X86 node; its last (i32) result becomes Cond.
17925     SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17926
17927     if (CondOpcode == ISD::UMULO)
17928       Cond = X86Op.getValue(2);
17929     else
17930       Cond = X86Op.getValue(1);
17931
17932     CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17933     AddTest = false;
17934   }
17935
17936   if (AddTest) {
17937     // Look past the truncate if the high bits are known zero.
17938     if (isTruncWithZeroHighBitsInput(Cond, DAG))
17939       Cond = Cond.getOperand(0);
17940
17941     // We know the result of AND is compared against zero. Try to match
17942     // it to BT.
17943     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17944       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17945         CC = NewSetCC.getOperand(0);
17946         Cond = NewSetCC.getOperand(1);
17947         AddTest = false;
17948       }
17949     }
17950   }
17951   // Still no usable flags: test Cond explicitly and select on COND_NE.
17952   if (AddTest) {
17953     CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17954     Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17955   }
17956
17957   // a <  b ? -1 :  0 -> RES = ~setcc_carry
17958   // a <  b ?  0 : -1 -> RES = setcc_carry
17959   // a >= b ? -1 :  0 -> RES = setcc_carry
17960   // a >= b ?  0 : -1 -> RES = ~setcc_carry
17961   if (Cond.getOpcode() == X86ISD::SUB) {
17962     Cond = ConvertCmpIfNecessary(Cond, DAG);
17963     unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17964
17965     if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17966         (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17967         (isNullConstant(Op1) || isNullConstant(Op2))) {
17968       SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17969                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17970                                 Cond);
17971       if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17972         return DAG.getNOT(DL, Res, Res.getValueType());
17973       return Res;
17974     }
17975   }
17976
17977   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17978   // widen the cmov and push the truncate through. This avoids introducing a new
17979   // branch during isel and doesn't add any extensions.
17980   if (Op.getValueType() == MVT::i8 &&
17981       Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17982     SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17983     if (T1.getValueType() == T2.getValueType() &&
17984         // Exclude CopyFromReg sources to avoid partial register stalls.
17985         T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17986       SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17987       SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17988       return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17989     }
17990   }
17991
17992   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17993   // condition is true.
17994   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17995   SDValue Ops[] = { Op2, Op1, CC, Cond }; // Note the order: false value first.
17996   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17997 }
17998
17999 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
18000                                        const X86Subtarget &Subtarget,
18001                                        SelectionDAG &DAG) {
18002   MVT VT = Op->getSimpleValueType(0);
18003   SDValue In = Op->getOperand(0);
18004   MVT InVT = In.getSimpleValueType();
18005   MVT VTElt = VT.getVectorElementType();
18006   MVT InVTElt = InVT.getVectorElementType();
18007   SDLoc dl(Op);
18008
18009   // SKX processor
18010   if ((InVTElt == MVT::i1) &&
18011       (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
18012
18013        ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
18014
18015     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18016
18017   unsigned NumElts = VT.getVectorNumElements();
18018
18019   if (VT.is512BitVector() && InVTElt != MVT::i1 &&
18020       (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
18021     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
18022       return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
18023     return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
18024   }
18025
18026   if (InVTElt != MVT::i1)
18027     return SDValue();
18028
18029   MVT ExtVT = VT;
18030   if (!VT.is512BitVector() && !Subtarget.hasVLX())
18031     ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
18032
18033   SDValue V;
18034   if (Subtarget.hasDQI()) {
18035     V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
18036     assert(!VT.is512BitVector() && "Unexpected vector type");
18037   } else {
18038     SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
18039     SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
18040     V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
18041     if (ExtVT == VT)
18042       return V;
18043   }
18044
18045   return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
18046 }
18047
18048 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18049 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18050 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18051 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18052 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18053                                         const X86Subtarget &Subtarget,
18054                                         SelectionDAG &DAG) {
18055   SDValue In = Op->getOperand(0);
18056   MVT VT = Op->getSimpleValueType(0);
18057   MVT InVT = In.getSimpleValueType();
18058   assert(VT.getSizeInBits() == InVT.getSizeInBits());
18059
18060   MVT SVT = VT.getVectorElementType();
18061   MVT InSVT = InVT.getVectorElementType();
18062   assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18063
18064   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18065     return SDValue();
18066   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18067     return SDValue();
18068   if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18069       !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18070       !(VT.is512BitVector() && Subtarget.hasAVX512()))
18071     return SDValue();
18072
18073   SDLoc dl(Op);
18074
18075   // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18076   // For 512-bit vectors, we need 128-bits or 256-bits.
18077   if (VT.getSizeInBits() > 128) {
18078     // Input needs to be at least the same number of elements as output, and
18079     // at least 128-bits.
18080     int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18081     In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18082   }
18083
18084   assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18085           InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18086
18087   // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18088   // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18089   // need to be handled here for 256/512-bit results.
18090   if (Subtarget.hasInt256()) {
18091     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18092     unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18093                         X86ISD::VSEXT : X86ISD::VZEXT;
18094     return DAG.getNode(ExtOpc, dl, VT, In);
18095   }
18096
18097   // We should only get here for sign extend.
18098   assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18099          "Unexpected opcode!");
18100
18101   // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18102   SDValue Curr = In;
18103   MVT CurrVT = InVT;
18104
18105   // As SRAI is only available on i16/i32 types, we expand only up to i32
18106   // and handle i64 separately.
18107   while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18108     Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18109     MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18110     CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18111     Curr = DAG.getBitcast(CurrVT, Curr);
18112   }
18113
18114   SDValue SignExt = Curr;
18115   if (CurrVT != InVT) {
18116     unsigned SignExtShift =
18117         CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18118     SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18119                           DAG.getConstant(SignExtShift, dl, MVT::i8));
18120   }
18121
18122   if (CurrVT == VT)
18123     return SignExt;
18124
18125   if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18126     SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18127                                DAG.getConstant(31, dl, MVT::i8));
18128     SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18129     return DAG.getBitcast(VT, Ext);
18130   }
18131
18132   return SDValue();
18133 }
18134
18135 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18136                                 SelectionDAG &DAG) {
18137   MVT VT = Op->getSimpleValueType(0);
18138   SDValue In = Op->getOperand(0);
18139   MVT InVT = In.getSimpleValueType();
18140   SDLoc dl(Op);
18141
18142   if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
18143     return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
18144
18145   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18146       (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18147       (VT != MVT::v16i16 || InVT != MVT::v16i8))
18148     return SDValue();
18149
18150   if (Subtarget.hasInt256())
18151     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18152
18153   // Optimize vectors in AVX mode
18154   // Sign extend  v8i16 to v8i32 and
18155   //              v4i32 to v4i64
18156   //
18157   // Divide input vector into two parts
18158   // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
18159   // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
18160   // concat the vectors to original VT
18161
18162   unsigned NumElems = InVT.getVectorNumElements();
18163   SDValue Undef = DAG.getUNDEF(InVT);
18164
18165   SmallVector<int,8> ShufMask1(NumElems, -1);
18166   for (unsigned i = 0; i != NumElems/2; ++i)
18167     ShufMask1[i] = i;
18168
18169   SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18170
18171   SmallVector<int,8> ShufMask2(NumElems, -1);
18172   for (unsigned i = 0; i != NumElems/2; ++i)
18173     ShufMask2[i] = i + NumElems/2;
18174
18175   SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18176
18177   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18178                                 VT.getVectorNumElements() / 2);
18179
18180   OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18181   OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18182
18183   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18184 }
18185
18186 // Lower truncating store. We need a special lowering to vXi1 vectors
18187 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18188                                     SelectionDAG &DAG) {
18189   StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18190   SDLoc dl(St);
18191   EVT MemVT = St->getMemoryVT();
18192   assert(St->isTruncatingStore() && "We only custom truncating store.");
18193   assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18194          "Expected truncstore of i1 vector");
18195
18196   SDValue Op = St->getValue();
18197   MVT OpVT = Op.getValueType().getSimpleVT();
18198   unsigned NumElts = OpVT.getVectorNumElements();
18199   if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18200       NumElts == 16) {
18201     // Truncate and store - everything is legal
18202     Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18203     if (MemVT.getSizeInBits() < 8)
18204       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18205                        DAG.getUNDEF(MVT::v8i1), Op,
18206                        DAG.getIntPtrConstant(0, dl));
18207     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18208                         St->getMemOperand());
18209   }
18210
18211   // A subset, assume that we have only AVX-512F
18212   if (NumElts <= 8) {
18213     if (NumElts < 8) {
18214       // Extend to 8-elts vector
18215       MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18216       Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18217                         DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18218     }
18219     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18220     return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18221                         St->getMemOperand());
18222   }
18223   // v32i8
18224   assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18225   // Divide the vector into 2 parts and store each part separately
18226   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18227                             DAG.getIntPtrConstant(0, dl));
18228   Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18229   SDValue BasePtr = St->getBasePtr();
18230   SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18231                               St->getMemOperand());
18232   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18233                             DAG.getIntPtrConstant(16, dl));
18234   Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18235
18236   SDValue BasePtrHi =
18237     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18238                 DAG.getConstant(2, dl, BasePtr.getValueType()));
18239
18240   SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18241                               BasePtrHi, St->getMemOperand());
18242   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18243 }
18244
18245 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18246                                            const X86Subtarget &Subtarget,
18247                                            SelectionDAG &DAG) {
18248
18249   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18250   SDLoc dl(Ld);
18251   EVT MemVT = Ld->getMemoryVT();
18252   assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18253          "Expected i1 vector load");
18254   unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18255     ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18256   MVT VT = Op.getValueType().getSimpleVT();
18257   unsigned NumElts = VT.getVectorNumElements();
18258
18259   if ((Subtarget.hasBWI() && NumElts >= 32) ||
18260       (Subtarget.hasDQI() && NumElts < 16) ||
18261       NumElts == 16) {
18262     // Load and extend - everything is legal
18263     if (NumElts < 8) {
18264       SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18265                                  Ld->getBasePtr(),
18266                                  Ld->getMemOperand());
18267       // Replace chain users with the new chain.
18268       assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18269       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18270       MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18271       SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18272
18273       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18274                                    DAG.getIntPtrConstant(0, dl));
18275     }
18276     SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18277                                Ld->getBasePtr(),
18278                                Ld->getMemOperand());
18279     // Replace chain users with the new chain.
18280     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18281     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18282
18283     // Finally, do a normal sign-extend to the desired register.
18284     return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18285   }
18286
18287   if (NumElts <= 8) {
18288     // A subset, assume that we have only AVX-512F
18289     unsigned NumBitsToLoad = 8;
18290     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18291     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18292                               Ld->getBasePtr(),
18293                               Ld->getMemOperand());
18294     // Replace chain users with the new chain.
18295     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18296     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18297
18298     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18299     SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18300
18301     if (NumElts == 8)
18302       return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18303
18304       // we should take care to v4i1 and v2i1
18305
18306     MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18307     SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18308     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18309                         DAG.getIntPtrConstant(0, dl));
18310   }
18311
18312   assert(VT == MVT::v32i8 && "Unexpected extload type");
18313
18314   SmallVector<SDValue, 2> Chains;
18315
18316   SDValue BasePtr = Ld->getBasePtr();
18317   SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18318                                Ld->getBasePtr(),
18319                                Ld->getMemOperand());
18320   Chains.push_back(LoadLo.getValue(1));
18321
18322   SDValue BasePtrHi =
18323     DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18324                 DAG.getConstant(2, dl, BasePtr.getValueType()));
18325
18326   SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18327                                BasePtrHi,
18328                                Ld->getMemOperand());
18329   Chains.push_back(LoadHi.getValue(1));
18330   SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18331   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18332
18333   SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18334   SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18335   return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18336 }
18337
18338 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18339 // may emit an illegal shuffle but the expansion is still better than scalar
18340 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18341 // we'll emit a shuffle and a arithmetic shift.
18342 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18343 // TODO: It is possible to support ZExt by zeroing the undef values during
18344 // the shuffle phase or after the shuffle.
18345 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18346                                  SelectionDAG &DAG) {
18347   MVT RegVT = Op.getSimpleValueType();
18348   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18349   assert(RegVT.isInteger() &&
18350          "We only custom lower integer vector sext loads.");
18351
18352   // Nothing useful we can do without SSE2 shuffles.
18353   assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18354
18355   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18356   SDLoc dl(Ld);
18357   EVT MemVT = Ld->getMemoryVT();
18358   if (MemVT.getScalarType() == MVT::i1)
18359     return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18360
18361   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18362   unsigned RegSz = RegVT.getSizeInBits();
18363
18364   ISD::LoadExtType Ext = Ld->getExtensionType();
18365
18366   assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18367          && "Only anyext and sext are currently implemented.");
18368   assert(MemVT != RegVT && "Cannot extend to the same type");
18369   assert(MemVT.isVector() && "Must load a vector from memory");
18370
18371   unsigned NumElems = RegVT.getVectorNumElements();
18372   unsigned MemSz = MemVT.getSizeInBits();
18373   assert(RegSz > MemSz && "Register size must be greater than the mem size");
18374
18375   if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18376     // The only way in which we have a legal 256-bit vector result but not the
18377     // integer 256-bit operations needed to directly lower a sextload is if we
18378     // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18379     // a 128-bit vector and a normal sign_extend to 256-bits that should get
18380     // correctly legalized. We do this late to allow the canonical form of
18381     // sextload to persist throughout the rest of the DAG combiner -- it wants
18382     // to fold together any extensions it can, and so will fuse a sign_extend
18383     // of an sextload into a sextload targeting a wider value.
18384     SDValue Load;
18385     if (MemSz == 128) {
18386       // Just switch this to a normal load.
18387       assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18388                                        "it must be a legal 128-bit vector "
18389                                        "type!");
18390       Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18391                          Ld->getPointerInfo(), Ld->getAlignment(),
18392                          Ld->getMemOperand()->getFlags());
18393     } else {
18394       assert(MemSz < 128 &&
18395              "Can't extend a type wider than 128 bits to a 256 bit vector!");
18396       // Do an sext load to a 128-bit vector type. We want to use the same
18397       // number of elements, but elements half as wide. This will end up being
18398       // recursively lowered by this routine, but will succeed as we definitely
18399       // have all the necessary features if we're using AVX1.
18400       EVT HalfEltVT =
18401           EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18402       EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18403       Load =
18404           DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18405                          Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18406                          Ld->getMemOperand()->getFlags());
18407     }
18408
18409     // Replace chain users with the new chain.
18410     assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18411     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18412
18413     // Finally, do a normal sign-extend to the desired register.
18414     return DAG.getSExtOrTrunc(Load, dl, RegVT);
18415   }
18416
18417   // All sizes must be a power of two.
18418   assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18419          "Non-power-of-two elements are not custom lowered!");
18420
18421   // Attempt to load the original value using scalar loads.
18422   // Find the largest scalar type that divides the total loaded size.
18423   MVT SclrLoadTy = MVT::i8;
18424   for (MVT Tp : MVT::integer_valuetypes()) {
18425     if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18426       SclrLoadTy = Tp;
18427     }
18428   }
18429
18430   // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18431   if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18432       (64 <= MemSz))
18433     SclrLoadTy = MVT::f64;
18434
18435   // Calculate the number of scalar loads that we need to perform
18436   // in order to load our vector from memory.
18437   unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18438
18439   assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18440          "Can only lower sext loads with a single scalar load!");
18441
18442   unsigned loadRegZize = RegSz;
18443   if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18444     loadRegZize = 128;
18445
18446   // Represent our vector as a sequence of elements which are the
18447   // largest scalar that we can load.
18448   EVT LoadUnitVecVT = EVT::getVectorVT(
18449       *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18450
18451   // Represent the data using the same element type that is stored in
18452   // memory. In practice, we ''widen'' MemVT.
18453   EVT WideVecVT =
18454       EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18455                        loadRegZize / MemVT.getScalarSizeInBits());
18456
18457   assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18458          "Invalid vector type");
18459
18460   // We can't shuffle using an illegal type.
18461   assert(TLI.isTypeLegal(WideVecVT) &&
18462          "We only lower types that form legal widened vector types");
18463
18464   SmallVector<SDValue, 8> Chains;
18465   SDValue Ptr = Ld->getBasePtr();
18466   SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18467                                       TLI.getPointerTy(DAG.getDataLayout()));
18468   SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18469
18470   for (unsigned i = 0; i < NumLoads; ++i) {
18471     // Perform a single load.
18472     SDValue ScalarLoad =
18473         DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18474                     Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18475     Chains.push_back(ScalarLoad.getValue(1));
18476     // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18477     // another round of DAGCombining.
18478     if (i == 0)
18479       Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18480     else
18481       Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18482                         ScalarLoad, DAG.getIntPtrConstant(i, dl));
18483
18484     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18485   }
18486
18487   SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18488
18489   // Bitcast the loaded value to a vector of the original element type, in
18490   // the size of the target vector type.
18491   SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18492   unsigned SizeRatio = RegSz / MemSz;
18493
18494   if (Ext == ISD::SEXTLOAD) {
18495     // If we have SSE4.1, we can directly emit a VSEXT node.
18496     if (Subtarget.hasSSE41()) {
18497       SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18498       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18499       return Sext;
18500     }
18501
18502     // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18503     // lanes.
18504     assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18505            "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18506
18507     SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18508     DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18509     return Shuff;
18510   }
18511
18512   // Redistribute the loaded elements into the different locations.
18513   SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18514   for (unsigned i = 0; i != NumElems; ++i)
18515     ShuffleVec[i * SizeRatio] = i;
18516
18517   SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18518                                        DAG.getUNDEF(WideVecVT), ShuffleVec);
18519
18520   // Bitcast to the requested type.
18521   Shuff = DAG.getBitcast(RegVT, Shuff);
18522   DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18523   return Shuff;
18524 }
18525
18526 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18527 /// each of which has no other use apart from the AND / OR.
18528 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18529   Opc = Op.getOpcode();
18530   if (Opc != ISD::OR && Opc != ISD::AND)
18531     return false;
18532   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18533           Op.getOperand(0).hasOneUse() &&
18534           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18535           Op.getOperand(1).hasOneUse());
18536 }
18537
18538 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
18539 /// SETCC node has a single use.
18540 static bool isXor1OfSetCC(SDValue Op) {
18541   if (Op.getOpcode() != ISD::XOR)
18542     return false;
18543   if (isOneConstant(Op.getOperand(1)))
18544     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18545            Op.getOperand(0).hasOneUse();
18546   return false;
18547 }
18548
18549 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18550   bool addTest = true;
18551   SDValue Chain = Op.getOperand(0);
18552   SDValue Cond  = Op.getOperand(1);
18553   SDValue Dest  = Op.getOperand(2);
18554   SDLoc dl(Op);
18555   SDValue CC;
18556   bool Inverted = false;
18557
18558   if (Cond.getOpcode() == ISD::SETCC) {
18559     // Check for setcc([su]{add,sub,mul}o == 0).
18560     if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18561         isNullConstant(Cond.getOperand(1)) &&
18562         Cond.getOperand(0).getResNo() == 1 &&
18563         (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18564          Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18565          Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18566          Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18567          Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18568          Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18569       Inverted = true;
18570       Cond = Cond.getOperand(0);
18571     } else {
18572       if (SDValue NewCond = LowerSETCC(Cond, DAG))
18573         Cond = NewCond;
18574     }
18575   }
18576 #if 0
18577   // FIXME: LowerXALUO doesn't handle these!!
18578   else if (Cond.getOpcode() == X86ISD::ADD  ||
18579            Cond.getOpcode() == X86ISD::SUB  ||
18580            Cond.getOpcode() == X86ISD::SMUL ||
18581            Cond.getOpcode() == X86ISD::UMUL)
18582     Cond = LowerXALUO(Cond, DAG);
18583 #endif
18584
18585   // Look pass (and (setcc_carry (cmp ...)), 1).
18586   if (Cond.getOpcode() == ISD::AND &&
18587       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18588       isOneConstant(Cond.getOperand(1)))
18589     Cond = Cond.getOperand(0);
18590
18591   // If condition flag is set by a X86ISD::CMP, then use it as the condition
18592   // setting operand in place of the X86ISD::SETCC.
18593   unsigned CondOpcode = Cond.getOpcode();
18594   if (CondOpcode == X86ISD::SETCC ||
18595       CondOpcode == X86ISD::SETCC_CARRY) {
18596     CC = Cond.getOperand(0);
18597
18598     SDValue Cmp = Cond.getOperand(1);
18599     unsigned Opc = Cmp.getOpcode();
18600     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18601     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18602       Cond = Cmp;
18603       addTest = false;
18604     } else {
18605       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18606       default: break;
18607       case X86::COND_O:
18608       case X86::COND_B:
18609         // These can only come from an arithmetic instruction with overflow,
18610         // e.g. SADDO, UADDO.
18611         Cond = Cond.getOperand(1);
18612         addTest = false;
18613         break;
18614       }
18615     }
18616   }
18617   CondOpcode = Cond.getOpcode();
18618   if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18619       CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18620       ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18621        Cond.getOperand(0).getValueType() != MVT::i8)) {
18622     SDValue LHS = Cond.getOperand(0);
18623     SDValue RHS = Cond.getOperand(1);
18624     unsigned X86Opcode;
18625     unsigned X86Cond;
18626     SDVTList VTs;
18627     // Keep this in sync with LowerXALUO, otherwise we might create redundant
18628     // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18629     // X86ISD::INC).
18630     switch (CondOpcode) {
18631     case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18632     case ISD::SADDO:
18633       if (isOneConstant(RHS)) {
18634           X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18635           break;
18636         }
18637       X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18638     case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18639     case ISD::SSUBO:
18640       if (isOneConstant(RHS)) {
18641           X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18642           break;
18643         }
18644       X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18645     case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18646     case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18647     default: llvm_unreachable("unexpected overflowing operator");
18648     }
18649     if (Inverted)
18650       X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18651     if (CondOpcode == ISD::UMULO)
18652       VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18653                           MVT::i32);
18654     else
18655       VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18656
18657     SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18658
18659     if (CondOpcode == ISD::UMULO)
18660       Cond = X86Op.getValue(2);
18661     else
18662       Cond = X86Op.getValue(1);
18663
18664     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18665     addTest = false;
18666   } else {
18667     unsigned CondOpc;
18668     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18669       SDValue Cmp = Cond.getOperand(0).getOperand(1);
18670       if (CondOpc == ISD::OR) {
18671         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18672         // two branches instead of an explicit OR instruction with a
18673         // separate test.
18674         if (Cmp == Cond.getOperand(1).getOperand(1) &&
18675             isX86LogicalCmp(Cmp)) {
18676           CC = Cond.getOperand(0).getOperand(0);
18677           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18678                               Chain, Dest, CC, Cmp);
18679           CC = Cond.getOperand(1).getOperand(0);
18680           Cond = Cmp;
18681           addTest = false;
18682         }
18683       } else { // ISD::AND
18684         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18685         // two branches instead of an explicit AND instruction with a
18686         // separate test. However, we only do this if this block doesn't
18687         // have a fall-through edge, because this requires an explicit
18688         // jmp when the condition is false.
18689         if (Cmp == Cond.getOperand(1).getOperand(1) &&
18690             isX86LogicalCmp(Cmp) &&
18691             Op.getNode()->hasOneUse()) {
18692           X86::CondCode CCode =
18693             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18694           CCode = X86::GetOppositeBranchCondition(CCode);
18695           CC = DAG.getConstant(CCode, dl, MVT::i8);
18696           SDNode *User = *Op.getNode()->use_begin();
18697           // Look for an unconditional branch following this conditional branch.
18698           // We need this because we need to reverse the successors in order
18699           // to implement FCMP_OEQ.
18700           if (User->getOpcode() == ISD::BR) {
18701             SDValue FalseBB = User->getOperand(1);
18702             SDNode *NewBR =
18703               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18704             assert(NewBR == User);
18705             (void)NewBR;
18706             Dest = FalseBB;
18707
18708             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18709                                 Chain, Dest, CC, Cmp);
18710             X86::CondCode CCode =
18711               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18712             CCode = X86::GetOppositeBranchCondition(CCode);
18713             CC = DAG.getConstant(CCode, dl, MVT::i8);
18714             Cond = Cmp;
18715             addTest = false;
18716           }
18717         }
18718       }
18719     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18720       // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
18721       // It should be transformed during dag combiner except when the condition
18722       // is set by a arithmetics with overflow node.
18723       X86::CondCode CCode =
18724         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18725       CCode = X86::GetOppositeBranchCondition(CCode);
18726       CC = DAG.getConstant(CCode, dl, MVT::i8);
18727       Cond = Cond.getOperand(0).getOperand(1);
18728       addTest = false;
18729     } else if (Cond.getOpcode() == ISD::SETCC &&
18730                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18731       // For FCMP_OEQ, we can emit
18732       // two branches instead of an explicit AND instruction with a
18733       // separate test. However, we only do this if this block doesn't
18734       // have a fall-through edge, because this requires an explicit
18735       // jmp when the condition is false.
18736       if (Op.getNode()->hasOneUse()) {
18737         SDNode *User = *Op.getNode()->use_begin();
18738         // Look for an unconditional branch following this conditional branch.
18739         // We need this because we need to reverse the successors in order
18740         // to implement FCMP_OEQ.
18741         if (User->getOpcode() == ISD::BR) {
18742           SDValue FalseBB = User->getOperand(1);
18743           SDNode *NewBR =
18744             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18745           assert(NewBR == User);
18746           (void)NewBR;
18747           Dest = FalseBB;
18748
18749           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18750                                     Cond.getOperand(0), Cond.getOperand(1));
18751           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18752           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18753           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18754                               Chain, Dest, CC, Cmp);
18755           CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18756           Cond = Cmp;
18757           addTest = false;
18758         }
18759       }
18760     } else if (Cond.getOpcode() == ISD::SETCC &&
18761                cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18762       // For FCMP_UNE, we can emit
18763       // two branches instead of an explicit AND instruction with a
18764       // separate test. However, we only do this if this block doesn't
18765       // have a fall-through edge, because this requires an explicit
18766       // jmp when the condition is false.
18767       if (Op.getNode()->hasOneUse()) {
18768         SDNode *User = *Op.getNode()->use_begin();
18769         // Look for an unconditional branch following this conditional branch.
18770         // We need this because we need to reverse the successors in order
18771         // to implement FCMP_UNE.
18772         if (User->getOpcode() == ISD::BR) {
18773           SDValue FalseBB = User->getOperand(1);
18774           SDNode *NewBR =
18775             DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18776           assert(NewBR == User);
18777           (void)NewBR;
18778
18779           SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18780                                     Cond.getOperand(0), Cond.getOperand(1));
18781           Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18782           CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18783           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18784                               Chain, Dest, CC, Cmp);
18785           CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18786           Cond = Cmp;
18787           addTest = false;
18788           Dest = FalseBB;
18789         }
18790       }
18791     }
18792   }
18793
18794   if (addTest) {
18795     // Look pass the truncate if the high bits are known zero.
18796     if (isTruncWithZeroHighBitsInput(Cond, DAG))
18797         Cond = Cond.getOperand(0);
18798
18799     // We know the result is compared against zero. Try to match it to BT.
18800     if (Cond.hasOneUse()) {
18801       if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18802         CC = NewSetCC.getOperand(0);
18803         Cond = NewSetCC.getOperand(1);
18804         addTest = false;
18805       }
18806     }
18807   }
18808
18809   if (addTest) {
18810     X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18811     CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18812     Cond = EmitTest(Cond, X86Cond, dl, DAG);
18813   }
18814   Cond = ConvertCmpIfNecessary(Cond, DAG);
18815   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18816                      Chain, Dest, CC, Cond);
18817 }
18818
18819 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18820 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18821 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18822 // that the guard pages used by the OS virtual memory manager are allocated in
18823 // correct sequence.
18824 SDValue
18825 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18826                                            SelectionDAG &DAG) const {
18827   MachineFunction &MF = DAG.getMachineFunction();
18828   bool SplitStack = MF.shouldSplitStack();
18829   bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
18830   bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18831                SplitStack || EmitStackProbe;
18832   SDLoc dl(Op);
18833
18834   // Get the inputs.
18835   SDNode *Node = Op.getNode();
18836   SDValue Chain = Op.getOperand(0);
18837   SDValue Size  = Op.getOperand(1);
18838   unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18839   EVT VT = Node->getValueType(0);
18840
18841   // Chain the dynamic stack allocation so that it doesn't modify the stack
18842   // pointer when other instructions are using the stack.
18843   Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
18844
18845   bool Is64Bit = Subtarget.is64Bit();
18846   MVT SPTy = getPointerTy(DAG.getDataLayout());
18847
18848   SDValue Result;
18849   if (!Lower) {
18850     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18851     unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18852     assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18853                     " not tell us which reg is the stack pointer!");
18854
18855     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18856     Chain = SP.getValue(1);
18857     const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18858     unsigned StackAlign = TFI.getStackAlignment();
18859     Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18860     if (Align > StackAlign)
18861       Result = DAG.getNode(ISD::AND, dl, VT, Result,
18862                          DAG.getConstant(-(uint64_t)Align, dl, VT));
18863     Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18864   } else if (SplitStack) {
18865     MachineRegisterInfo &MRI = MF.getRegInfo();
18866
18867     if (Is64Bit) {
18868       // The 64 bit implementation of segmented stacks needs to clobber both r10
18869       // r11. This makes it impossible to use it along with nested parameters.
18870       const Function *F = MF.getFunction();
18871       for (const auto &A : F->args()) {
18872         if (A.hasNestAttr())
18873           report_fatal_error("Cannot use segmented stacks with functions that "
18874                              "have nested arguments.");
18875       }
18876     }
18877
18878     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18879     unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18880     Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18881     Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18882                                 DAG.getRegister(Vreg, SPTy));
18883   } else {
18884     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18885     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18886     MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18887
18888     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18889     unsigned SPReg = RegInfo->getStackRegister();
18890     SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18891     Chain = SP.getValue(1);
18892
18893     if (Align) {
18894       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18895                        DAG.getConstant(-(uint64_t)Align, dl, VT));
18896       Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18897     }
18898
18899     Result = SP;
18900   }
18901
18902   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18903                              DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18904
18905   SDValue Ops[2] = {Result, Chain};
18906   return DAG.getMergeValues(Ops, dl);
18907 }
18908
18909 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18910   MachineFunction &MF = DAG.getMachineFunction();
18911   auto PtrVT = getPointerTy(MF.getDataLayout());
18912   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18913
18914   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18915   SDLoc DL(Op);
18916
18917   if (!Subtarget.is64Bit() ||
18918       Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18919     // vastart just stores the address of the VarArgsFrameIndex slot into the
18920     // memory location argument.
18921     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18922     return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18923                         MachinePointerInfo(SV));
18924   }
18925
18926   // __va_list_tag:
18927   //   gp_offset         (0 - 6 * 8)
18928   //   fp_offset         (48 - 48 + 8 * 16)
18929   //   overflow_arg_area (point to parameters coming in memory).
18930   //   reg_save_area
18931   SmallVector<SDValue, 8> MemOps;
18932   SDValue FIN = Op.getOperand(1);
18933   // Store gp_offset
18934   SDValue Store = DAG.getStore(
18935       Op.getOperand(0), DL,
18936       DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18937       MachinePointerInfo(SV));
18938   MemOps.push_back(Store);
18939
18940   // Store fp_offset
18941   FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18942   Store = DAG.getStore(
18943       Op.getOperand(0), DL,
18944       DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18945       MachinePointerInfo(SV, 4));
18946   MemOps.push_back(Store);
18947
18948   // Store ptr to overflow_arg_area
18949   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18950   SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18951   Store =
18952       DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18953   MemOps.push_back(Store);
18954
18955   // Store ptr to reg_save_area.
18956   FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18957       Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18958   SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18959   Store = DAG.getStore(
18960       Op.getOperand(0), DL, RSFIN, FIN,
18961       MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18962   MemOps.push_back(Store);
18963   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18964 }
18965
18966 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18967   assert(Subtarget.is64Bit() &&
18968          "LowerVAARG only handles 64-bit va_arg!");
18969   assert(Op.getNumOperands() == 4);
18970
18971   MachineFunction &MF = DAG.getMachineFunction();
18972   if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18973     // The Win64 ABI uses char* instead of a structure.
18974     return DAG.expandVAArg(Op.getNode());
18975
18976   SDValue Chain = Op.getOperand(0);
18977   SDValue SrcPtr = Op.getOperand(1);
18978   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18979   unsigned Align = Op.getConstantOperandVal(3);
18980   SDLoc dl(Op);
18981
18982   EVT ArgVT = Op.getNode()->getValueType(0);
18983   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18984   uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18985   uint8_t ArgMode;
18986
18987   // Decide which area this value should be read from.
18988   // TODO: Implement the AMD64 ABI in its entirety. This simple
18989   // selection mechanism works only for the basic types.
18990   if (ArgVT == MVT::f80) {
18991     llvm_unreachable("va_arg for f80 not yet implemented");
18992   } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18993     ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
18994   } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18995     ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
18996   } else {
18997     llvm_unreachable("Unhandled argument type in LowerVAARG");
18998   }
18999
19000   if (ArgMode == 2) {
19001     // Sanity Check: Make sure using fp_offset makes sense.
19002     assert(!Subtarget.useSoftFloat() &&
19003            !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
19004            Subtarget.hasSSE1());
19005   }
19006
19007   // Insert VAARG_64 node into the DAG
19008   // VAARG_64 returns two values: Variable Argument Address, Chain
19009   SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19010                        DAG.getConstant(ArgMode, dl, MVT::i8),
19011                        DAG.getConstant(Align, dl, MVT::i32)};
19012   SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19013   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
19014                                           VTs, InstOps, MVT::i64,
19015                                           MachinePointerInfo(SV),
19016                                           /*Align=*/0,
19017                                           /*Volatile=*/false,
19018                                           /*ReadMem=*/true,
19019                                           /*WriteMem=*/true);
19020   Chain = VAARG.getValue(1);
19021
19022   // Load the next argument and return it
19023   return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19024 }
19025
19026 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19027                            SelectionDAG &DAG) {
19028   // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19029   // where a va_list is still an i8*.
19030   assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19031   if (Subtarget.isCallingConvWin64(
19032         DAG.getMachineFunction().getFunction()->getCallingConv()))
19033     // Probably a Win64 va_copy.
19034     return DAG.expandVACopy(Op.getNode());
19035
19036   SDValue Chain = Op.getOperand(0);
19037   SDValue DstPtr = Op.getOperand(1);
19038   SDValue SrcPtr = Op.getOperand(2);
19039   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19040   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19041   SDLoc DL(Op);
19042
19043   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19044                        DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19045                        false, false,
19046                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19047 }
19048
19049 /// Handle vector element shifts where the shift amount is a constant.
19050 /// Takes immediate version of shift as input.
19051 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19052                                           SDValue SrcOp, uint64_t ShiftAmt,
19053                                           SelectionDAG &DAG) {
19054   MVT ElementType = VT.getVectorElementType();
19055
19056   // Bitcast the source vector to the output type, this is mainly necessary for
19057   // vXi8/vXi64 shifts.
19058   if (VT != SrcOp.getSimpleValueType())
19059     SrcOp = DAG.getBitcast(VT, SrcOp);
19060
19061   // Fold this packed shift into its first operand if ShiftAmt is 0.
19062   if (ShiftAmt == 0)
19063     return SrcOp;
19064
19065   // Check for ShiftAmt >= element width
19066   if (ShiftAmt >= ElementType.getSizeInBits()) {
19067     if (Opc == X86ISD::VSRAI)
19068       ShiftAmt = ElementType.getSizeInBits() - 1;
19069     else
19070       return DAG.getConstant(0, dl, VT);
19071   }
19072
19073   assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19074          && "Unknown target vector shift-by-constant node");
19075
19076   // Fold this packed vector shift into a build vector if SrcOp is a
19077   // vector of Constants or UNDEFs.
19078   if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19079     SmallVector<SDValue, 8> Elts;
19080     unsigned NumElts = SrcOp->getNumOperands();
19081     ConstantSDNode *ND;
19082
19083     switch(Opc) {
19084     default: llvm_unreachable("Unknown opcode!");
19085     case X86ISD::VSHLI:
19086       for (unsigned i=0; i!=NumElts; ++i) {
19087         SDValue CurrentOp = SrcOp->getOperand(i);
19088         if (CurrentOp->isUndef()) {
19089           Elts.push_back(CurrentOp);
19090           continue;
19091         }
19092         ND = cast<ConstantSDNode>(CurrentOp);
19093         const APInt &C = ND->getAPIntValue();
19094         Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19095       }
19096       break;
19097     case X86ISD::VSRLI:
19098       for (unsigned i=0; i!=NumElts; ++i) {
19099         SDValue CurrentOp = SrcOp->getOperand(i);
19100         if (CurrentOp->isUndef()) {
19101           Elts.push_back(CurrentOp);
19102           continue;
19103         }
19104         ND = cast<ConstantSDNode>(CurrentOp);
19105         const APInt &C = ND->getAPIntValue();
19106         Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19107       }
19108       break;
19109     case X86ISD::VSRAI:
19110       for (unsigned i=0; i!=NumElts; ++i) {
19111         SDValue CurrentOp = SrcOp->getOperand(i);
19112         if (CurrentOp->isUndef()) {
19113           Elts.push_back(CurrentOp);
19114           continue;
19115         }
19116         ND = cast<ConstantSDNode>(CurrentOp);
19117         const APInt &C = ND->getAPIntValue();
19118         Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19119       }
19120       break;
19121     }
19122
19123     return DAG.getBuildVector(VT, dl, Elts);
19124   }
19125
19126   return DAG.getNode(Opc, dl, VT, SrcOp,
19127                      DAG.getConstant(ShiftAmt, dl, MVT::i8));
19128 }
19129
19130 /// Handle vector element shifts where the shift amount may or may not be a
19131 /// constant. Takes immediate version of shift as input.
19132 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19133                                    SDValue SrcOp, SDValue ShAmt,
19134                                    const X86Subtarget &Subtarget,
19135                                    SelectionDAG &DAG) {
19136   MVT SVT = ShAmt.getSimpleValueType();
19137   assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19138
19139   // Catch shift-by-constant.
19140   if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19141     return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19142                                       CShAmt->getZExtValue(), DAG);
19143
19144   // Change opcode to non-immediate version
19145   switch (Opc) {
19146     default: llvm_unreachable("Unknown target vector shift node");
19147     case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19148     case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19149     case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19150   }
19151
19152   // Need to build a vector containing shift amount.
19153   // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19154   // +=================+============+=======================================+
19155   // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
19156   // +=================+============+=======================================+
19157   // | i64             | Yes, No    | Use ShAmt as lowest elt               |
19158   // | i32             | Yes        | zero-extend in-reg                    |
19159   // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
19160   // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19161   // +=================+============+=======================================+
19162
19163   if (SVT == MVT::i64)
19164     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19165   else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19166            ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19167     ShAmt = ShAmt.getOperand(0);
19168     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19169     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19170   } else if (Subtarget.hasSSE41() &&
19171              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19172     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19173     ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19174   } else {
19175     SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
19176                                      DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19177     ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19178   }
19179
19180   // The return type has to be a 128-bit type with the same element
19181   // type as the input type.
19182   MVT EltVT = VT.getVectorElementType();
19183   MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19184
19185   ShAmt = DAG.getBitcast(ShVT, ShAmt);
19186   return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19187 }
19188
19189 /// \brief Return Mask with the necessary casting or extending
19190 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19191 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19192                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
19193                            const SDLoc &dl) {
19194
19195   if (isAllOnesConstant(Mask))
19196     return DAG.getTargetConstant(1, dl, MaskVT);
19197   if (X86::isZeroNode(Mask))
19198     return DAG.getTargetConstant(0, dl, MaskVT);
19199
19200   if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19201     // Mask should be extended
19202     Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19203                        MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19204   }
19205
19206   if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19207     if (MaskVT == MVT::v64i1) {
19208       assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19209       // In case 32bit mode, bitcast i64 is illegal, extend/split it.
19210       SDValue Lo, Hi;
19211       Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19212                           DAG.getConstant(0, dl, MVT::i32));
19213       Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19214                           DAG.getConstant(1, dl, MVT::i32));
19215
19216       Lo = DAG.getBitcast(MVT::v32i1, Lo);
19217       Hi = DAG.getBitcast(MVT::v32i1, Hi);
19218
19219       return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19220     } else {
19221       // MaskVT require < 64bit. Truncate mask (should succeed in any case),
19222       // and bitcast.
19223       MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19224       return DAG.getBitcast(MaskVT,
19225                             DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19226     }
19227
19228   } else {
19229     MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19230                                      Mask.getSimpleValueType().getSizeInBits());
19231     // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
19232     // are extracted by EXTRACT_SUBVECTOR.
19233     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19234                        DAG.getBitcast(BitcastVT, Mask),
19235                        DAG.getIntPtrConstant(0, dl));
19236   }
19237 }
19238
19239 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19240 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19241 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19242 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19243                   SDValue PreservedSrc,
19244                   const X86Subtarget &Subtarget,
19245                   SelectionDAG &DAG) {
19246   MVT VT = Op.getSimpleValueType();
19247   MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19248   unsigned OpcodeSelect = ISD::VSELECT;
19249   SDLoc dl(Op);
19250
19251   if (isAllOnesConstant(Mask))
19252     return Op;
19253
19254   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19255
19256   switch (Op.getOpcode()) {
19257   default: break;
19258   case X86ISD::PCMPEQM:
19259   case X86ISD::PCMPGTM:
19260   case X86ISD::CMPM:
19261   case X86ISD::CMPMU:
19262     return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19263   case X86ISD::VFPCLASS:
19264     case X86ISD::VFPCLASSS:
19265     return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19266   case X86ISD::VTRUNC:
19267   case X86ISD::VTRUNCS:
19268   case X86ISD::VTRUNCUS:
19269   case X86ISD::CVTPS2PH:
19270     // We can't use ISD::VSELECT here because it is not always "Legal"
19271     // for the destination type. For example vpmovqb require only AVX512
19272     // and vselect that can operate on byte element type require BWI
19273     OpcodeSelect = X86ISD::SELECT;
19274     break;
19275   }
19276   if (PreservedSrc.isUndef())
19277     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19278   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19279 }
19280
19281 /// \brief Creates an SDNode for a predicated scalar operation.
19282 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19283 /// The mask is coming as MVT::i8 and it should be transformed
19284 /// to MVT::v1i1 while lowering masking intrinsics.
19285 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19286 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19287 /// for a scalar instruction.
19288 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19289                                     SDValue PreservedSrc,
19290                                     const X86Subtarget &Subtarget,
19291                                     SelectionDAG &DAG) {
19292
19293   if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19294     if (MaskConst->getZExtValue() & 0x1)
19295       return Op;
19296
19297   MVT VT = Op.getSimpleValueType();
19298   SDLoc dl(Op);
19299
19300   SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19301   if (Op.getOpcode() == X86ISD::FSETCCM ||
19302       Op.getOpcode() == X86ISD::FSETCCM_RND)
19303     return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19304   if (Op.getOpcode() == X86ISD::VFPCLASSS)
19305     return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19306
19307   if (PreservedSrc.isUndef())
19308     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19309   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19310 }
19311
19312 static int getSEHRegistrationNodeSize(const Function *Fn) {
19313   if (!Fn->hasPersonalityFn())
19314     report_fatal_error(
19315         "querying registration node size for function without personality");
19316   // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19317   // WinEHStatePass for the full struct definition.
19318   switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19319   case EHPersonality::MSVC_X86SEH: return 24;
19320   case EHPersonality::MSVC_CXX: return 16;
19321   default: break;
19322   }
19323   report_fatal_error(
19324       "can only recover FP for 32-bit MSVC EH personality functions");
19325 }
19326
19327 /// When the MSVC runtime transfers control to us, either to an outlined
19328 /// function or when returning to a parent frame after catching an exception, we
19329 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19330 /// Here's the math:
19331 ///   RegNodeBase = EntryEBP - RegNodeSize
19332 ///   ParentFP = RegNodeBase - ParentFrameOffset
19333 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19334 /// subtracting the offset (negative on x86) takes us back to the parent FP.
19335 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19336                                    SDValue EntryEBP) {
19337   MachineFunction &MF = DAG.getMachineFunction();
19338   SDLoc dl;
19339
19340   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19341   MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19342
19343   // It's possible that the parent function no longer has a personality function
19344   // if the exceptional code was optimized away, in which case we just return
19345   // the incoming EBP.
19346   if (!Fn->hasPersonalityFn())
19347     return EntryEBP;
19348
19349   // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19350   // registration, or the .set_setframe offset.
19351   MCSymbol *OffsetSym =
19352       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19353           GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19354   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19355   SDValue ParentFrameOffset =
19356       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19357
19358   // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19359   // prologue to RBP in the parent function.
19360   const X86Subtarget &Subtarget =
19361       static_cast<const X86Subtarget &>(DAG.getSubtarget());
19362   if (Subtarget.is64Bit())
19363     return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19364
19365   int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19366   // RegNodeBase = EntryEBP - RegNodeSize
19367   // ParentFP = RegNodeBase - ParentFrameOffset
19368   SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19369                                     DAG.getConstant(RegNodeSize, dl, PtrVT));
19370   return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19371 }
19372
19373 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19374                                        SelectionDAG &DAG) {
19375   // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19376   auto isRoundModeCurDirection = [](SDValue Rnd) {
19377     if (!isa<ConstantSDNode>(Rnd))
19378       return false;
19379
19380     unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19381     return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19382   };
19383
19384   SDLoc dl(Op);
19385   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19386   MVT VT = Op.getSimpleValueType();
19387   const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19388   if (IntrData) {
19389     switch(IntrData->Type) {
19390     case INTR_TYPE_1OP:
19391       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19392     case INTR_TYPE_2OP:
19393       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19394         Op.getOperand(2));
19395     case INTR_TYPE_3OP:
19396       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19397         Op.getOperand(2), Op.getOperand(3));
19398     case INTR_TYPE_4OP:
19399       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19400         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19401     case INTR_TYPE_1OP_MASK_RM: {
19402       SDValue Src = Op.getOperand(1);
19403       SDValue PassThru = Op.getOperand(2);
19404       SDValue Mask = Op.getOperand(3);
19405       SDValue RoundingMode;
19406       // We always add rounding mode to the Node.
19407       // If the rounding mode is not specified, we add the
19408       // "current direction" mode.
19409       if (Op.getNumOperands() == 4)
19410         RoundingMode =
19411           DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19412       else
19413         RoundingMode = Op.getOperand(4);
19414       assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19415       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19416                                               RoundingMode),
19417                                   Mask, PassThru, Subtarget, DAG);
19418     }
19419     case INTR_TYPE_1OP_MASK: {
19420       SDValue Src = Op.getOperand(1);
19421       SDValue PassThru = Op.getOperand(2);
19422       SDValue Mask = Op.getOperand(3);
19423       // We add rounding mode to the Node when
19424       //   - RM Opcode is specified and
19425       //   - RM is not "current direction".
19426       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19427       if (IntrWithRoundingModeOpcode != 0) {
19428         SDValue Rnd = Op.getOperand(4);
19429         if (!isRoundModeCurDirection(Rnd)) {
19430           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19431                                       dl, Op.getValueType(),
19432                                       Src, Rnd),
19433                                       Mask, PassThru, Subtarget, DAG);
19434         }
19435       }
19436       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19437                                   Mask, PassThru, Subtarget, DAG);
19438     }
19439     case INTR_TYPE_SCALAR_MASK: {
19440       SDValue Src1 = Op.getOperand(1);
19441       SDValue Src2 = Op.getOperand(2);
19442       SDValue passThru = Op.getOperand(3);
19443       SDValue Mask = Op.getOperand(4);
19444       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19445       if (IntrWithRoundingModeOpcode != 0) {
19446         SDValue Rnd = Op.getOperand(5);
19447         if (!isRoundModeCurDirection(Rnd))
19448           return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19449                                                   dl, VT, Src1, Src2, Rnd),
19450                                       Mask, passThru, Subtarget, DAG);
19451       }
19452       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19453                                   Mask, passThru, Subtarget, DAG);
19454     }
19455     case INTR_TYPE_SCALAR_MASK_RM: {
19456       SDValue Src1 = Op.getOperand(1);
19457       SDValue Src2 = Op.getOperand(2);
19458       SDValue Src0 = Op.getOperand(3);
19459       SDValue Mask = Op.getOperand(4);
19460       // There are 2 kinds of intrinsics in this group:
19461       // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
19462       // (2) With rounding mode and sae - 7 operands.
19463       if (Op.getNumOperands() == 6) {
19464         SDValue Sae  = Op.getOperand(5);
19465         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19466                                                 Sae),
19467                                     Mask, Src0, Subtarget, DAG);
19468       }
19469       assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19470       SDValue RoundingMode  = Op.getOperand(5);
19471       SDValue Sae  = Op.getOperand(6);
19472       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19473                                               RoundingMode, Sae),
19474                                   Mask, Src0, Subtarget, DAG);
19475     }
19476     case INTR_TYPE_2OP_MASK:
19477     case INTR_TYPE_2OP_IMM8_MASK: {
19478       SDValue Src1 = Op.getOperand(1);
19479       SDValue Src2 = Op.getOperand(2);
19480       SDValue PassThru = Op.getOperand(3);
19481       SDValue Mask = Op.getOperand(4);
19482
19483       if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19484         Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19485
19486       // We specify 2 possible opcodes for intrinsics with rounding modes.
19487       // First, we check if the intrinsic may have non-default rounding mode,
19488       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19489       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19490       if (IntrWithRoundingModeOpcode != 0) {
19491         SDValue Rnd = Op.getOperand(5);
19492         if (!isRoundModeCurDirection(Rnd)) {
19493           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19494                                       dl, Op.getValueType(),
19495                                       Src1, Src2, Rnd),
19496                                       Mask, PassThru, Subtarget, DAG);
19497         }
19498       }
19499       // TODO: Intrinsics should have fast-math-flags to propagate.
19500       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19501                                   Mask, PassThru, Subtarget, DAG);
19502     }
19503     case INTR_TYPE_2OP_MASK_RM: {
19504       SDValue Src1 = Op.getOperand(1);
19505       SDValue Src2 = Op.getOperand(2);
19506       SDValue PassThru = Op.getOperand(3);
19507       SDValue Mask = Op.getOperand(4);
19508       // We specify 2 possible modes for intrinsics, with/without rounding
19509       // modes.
19510       // First, we check if the intrinsic have rounding mode (6 operands),
19511       // if not, we set rounding mode to "current".
19512       SDValue Rnd;
19513       if (Op.getNumOperands() == 6)
19514         Rnd = Op.getOperand(5);
19515       else
19516         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19517       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19518                                               Src1, Src2, Rnd),
19519                                   Mask, PassThru, Subtarget, DAG);
19520     }
19521     case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19522       SDValue Src1 = Op.getOperand(1);
19523       SDValue Src2 = Op.getOperand(2);
19524       SDValue Src3 = Op.getOperand(3);
19525       SDValue PassThru = Op.getOperand(4);
19526       SDValue Mask = Op.getOperand(5);
19527       SDValue Sae  = Op.getOperand(6);
19528
19529       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19530                                               Src2, Src3, Sae),
19531                                   Mask, PassThru, Subtarget, DAG);
19532     }
19533     case INTR_TYPE_3OP_MASK_RM: {
19534       SDValue Src1 = Op.getOperand(1);
19535       SDValue Src2 = Op.getOperand(2);
19536       SDValue Imm = Op.getOperand(3);
19537       SDValue PassThru = Op.getOperand(4);
19538       SDValue Mask = Op.getOperand(5);
19539       // We specify 2 possible modes for intrinsics, with/without rounding
19540       // modes.
19541       // First, we check if the intrinsic have rounding mode (7 operands),
19542       // if not, we set rounding mode to "current".
19543       SDValue Rnd;
19544       if (Op.getNumOperands() == 7)
19545         Rnd = Op.getOperand(6);
19546       else
19547         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19548       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19549                                               Src1, Src2, Imm, Rnd),
19550                                   Mask, PassThru, Subtarget, DAG);
19551     }
19552     case INTR_TYPE_3OP_IMM8_MASK:
19553     case INTR_TYPE_3OP_MASK: {
19554       SDValue Src1 = Op.getOperand(1);
19555       SDValue Src2 = Op.getOperand(2);
19556       SDValue Src3 = Op.getOperand(3);
19557       SDValue PassThru = Op.getOperand(4);
19558       SDValue Mask = Op.getOperand(5);
19559
19560       if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19561         Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19562
19563       // We specify 2 possible opcodes for intrinsics with rounding modes.
19564       // First, we check if the intrinsic may have non-default rounding mode,
19565       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19566       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19567       if (IntrWithRoundingModeOpcode != 0) {
19568         SDValue Rnd = Op.getOperand(6);
19569         if (!isRoundModeCurDirection(Rnd)) {
19570           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19571                                       dl, Op.getValueType(),
19572                                       Src1, Src2, Src3, Rnd),
19573                                       Mask, PassThru, Subtarget, DAG);
19574         }
19575       }
19576       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19577                                               Src1, Src2, Src3),
19578                                   Mask, PassThru, Subtarget, DAG);
19579     }
19580     case VPERM_2OP_MASK : {
19581       SDValue Src1 = Op.getOperand(1);
19582       SDValue Src2 = Op.getOperand(2);
19583       SDValue PassThru = Op.getOperand(3);
19584       SDValue Mask = Op.getOperand(4);
19585
19586       // Swap Src1 and Src2 in the node creation
19587       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19588                                   Mask, PassThru, Subtarget, DAG);
19589     }
19590     case VPERM_3OP_MASKZ:
19591     case VPERM_3OP_MASK:{
19592       MVT VT = Op.getSimpleValueType();
19593       // Src2 is the PassThru
19594       SDValue Src1 = Op.getOperand(1);
19595       // PassThru needs to be the same type as the destination in order
19596       // to pattern match correctly.
19597       SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19598       SDValue Src3 = Op.getOperand(3);
19599       SDValue Mask = Op.getOperand(4);
19600       SDValue PassThru = SDValue();
19601
19602       // set PassThru element
19603       if (IntrData->Type == VPERM_3OP_MASKZ)
19604         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19605       else
19606         PassThru = Src2;
19607
19608       // Swap Src1 and Src2 in the node creation
19609       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19610                                               dl, Op.getValueType(),
19611                                               Src2, Src1, Src3),
19612                                   Mask, PassThru, Subtarget, DAG);
19613     }
19614     case FMA_OP_MASK3:
19615     case FMA_OP_MASKZ:
19616     case FMA_OP_MASK: {
19617       SDValue Src1 = Op.getOperand(1);
19618       SDValue Src2 = Op.getOperand(2);
19619       SDValue Src3 = Op.getOperand(3);
19620       SDValue Mask = Op.getOperand(4);
19621       MVT VT = Op.getSimpleValueType();
19622       SDValue PassThru = SDValue();
19623
19624       // set PassThru element
19625       if (IntrData->Type == FMA_OP_MASKZ)
19626         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19627       else if (IntrData->Type == FMA_OP_MASK3)
19628         PassThru = Src3;
19629       else
19630         PassThru = Src1;
19631
19632       // We specify 2 possible opcodes for intrinsics with rounding modes.
19633       // First, we check if the intrinsic may have non-default rounding mode,
19634       // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19635       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19636       if (IntrWithRoundingModeOpcode != 0) {
19637         SDValue Rnd = Op.getOperand(5);
19638         if (!isRoundModeCurDirection(Rnd))
19639           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19640                                                   dl, Op.getValueType(),
19641                                                   Src1, Src2, Src3, Rnd),
19642                                       Mask, PassThru, Subtarget, DAG);
19643       }
19644       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19645                                               dl, Op.getValueType(),
19646                                               Src1, Src2, Src3),
19647                                   Mask, PassThru, Subtarget, DAG);
19648     }
19649     case FMA_OP_SCALAR_MASK:
19650     case FMA_OP_SCALAR_MASK3:
19651     case FMA_OP_SCALAR_MASKZ: {
19652       SDValue Src1 = Op.getOperand(1);
19653       SDValue Src2 = Op.getOperand(2);
19654       SDValue Src3 = Op.getOperand(3);
19655       SDValue Mask = Op.getOperand(4);
19656       MVT VT = Op.getSimpleValueType();
19657       SDValue PassThru = SDValue();
19658
19659       // set PassThru element
19660       if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19661         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19662       else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19663         PassThru = Src3;
19664       else
19665         PassThru = Src1;
19666
19667       SDValue Rnd = Op.getOperand(5);
19668       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19669                                               Op.getValueType(), Src1, Src2,
19670                                               Src3, Rnd),
19671                                   Mask, PassThru, Subtarget, DAG);
19672     }
19673     case TERLOG_OP_MASK:
19674     case TERLOG_OP_MASKZ: {
19675       SDValue Src1 = Op.getOperand(1);
19676       SDValue Src2 = Op.getOperand(2);
19677       SDValue Src3 = Op.getOperand(3);
19678       SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19679       SDValue Mask = Op.getOperand(5);
19680       MVT VT = Op.getSimpleValueType();
19681       SDValue PassThru = Src1;
19682       // Set PassThru element.
19683       if (IntrData->Type == TERLOG_OP_MASKZ)
19684         PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19685
19686       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19687                                               Src1, Src2, Src3, Src4),
19688                                   Mask, PassThru, Subtarget, DAG);
19689     }
19690     case CVTPD2PS:
19691       // ISD::FP_ROUND has a second argument that indicates if the truncation
19692       // does not change the value. Set it to 0 since it can change.
19693       return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19694                          DAG.getIntPtrConstant(0, dl));
19695     case CVTPD2PS_MASK: {
19696       SDValue Src = Op.getOperand(1);
19697       SDValue PassThru = Op.getOperand(2);
19698       SDValue Mask = Op.getOperand(3);
19699       // We add rounding mode to the Node when
19700       //   - RM Opcode is specified and
19701       //   - RM is not "current direction".
19702       unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19703       if (IntrWithRoundingModeOpcode != 0) {
19704         SDValue Rnd = Op.getOperand(4);
19705         if (!isRoundModeCurDirection(Rnd)) {
19706           return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19707                                       dl, Op.getValueType(),
19708                                       Src, Rnd),
19709                                       Mask, PassThru, Subtarget, DAG);
19710         }
19711       }
19712       assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19713       // ISD::FP_ROUND has a second argument that indicates if the truncation
19714       // does not change the value. Set it to 0 since it can change.
19715       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19716                                               DAG.getIntPtrConstant(0, dl)),
19717                                   Mask, PassThru, Subtarget, DAG);
19718     }
19719     case FPCLASS: {
19720       // FPclass intrinsics with mask
19721        SDValue Src1 = Op.getOperand(1);
19722        MVT VT = Src1.getSimpleValueType();
19723        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19724        SDValue Imm = Op.getOperand(2);
19725        SDValue Mask = Op.getOperand(3);
19726        MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19727                                      Mask.getSimpleValueType().getSizeInBits());
19728        SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19729        SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19730                                                  DAG.getTargetConstant(0, dl, MaskVT),
19731                                                  Subtarget, DAG);
19732        SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19733                                  DAG.getUNDEF(BitcastVT), FPclassMask,
19734                                  DAG.getIntPtrConstant(0, dl));
19735        return DAG.getBitcast(Op.getValueType(), Res);
19736     }
19737     case FPCLASSS: {
19738       SDValue Src1 = Op.getOperand(1);
19739       SDValue Imm = Op.getOperand(2);
19740       SDValue Mask = Op.getOperand(3);
19741       SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
19742       SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19743         DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19744       return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
19745                          DAG.getIntPtrConstant(0, dl));
19746     }
19747     case CMP_MASK:
19748     case CMP_MASK_CC: {
19749       // Comparison intrinsics with masks.
19750       // Example of transformation:
19751       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19752       //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19753       // (i8 (bitcast
19754       //   (v8i1 (insert_subvector undef,
19755       //           (v2i1 (and (PCMPEQM %a, %b),
19756       //                      (extract_subvector
19757       //                         (v8i1 (bitcast %mask)), 0))), 0))))
19758       MVT VT = Op.getOperand(1).getSimpleValueType();
19759       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19760       SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19761       MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19762                                        Mask.getSimpleValueType().getSizeInBits());
19763       SDValue Cmp;
19764       if (IntrData->Type == CMP_MASK_CC) {
19765         SDValue CC = Op.getOperand(3);
19766         CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19767         // We specify 2 possible opcodes for intrinsics with rounding modes.
19768         // First, we check if the intrinsic may have non-default rounding mode,
19769         // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19770         if (IntrData->Opc1 != 0) {
19771           SDValue Rnd = Op.getOperand(5);
19772           if (!isRoundModeCurDirection(Rnd))
19773             Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19774                               Op.getOperand(2), CC, Rnd);
19775         }
19776         //default rounding mode
19777         if(!Cmp.getNode())
19778             Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19779                               Op.getOperand(2), CC);
19780
19781       } else {
19782         assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19783         Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19784                           Op.getOperand(2));
19785       }
19786       SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19787                                              DAG.getTargetConstant(0, dl,
19788                                                                    MaskVT),
19789                                              Subtarget, DAG);
19790       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19791                                 DAG.getUNDEF(BitcastVT), CmpMask,
19792                                 DAG.getIntPtrConstant(0, dl));
19793       return DAG.getBitcast(Op.getValueType(), Res);
19794     }
19795     case CMP_MASK_SCALAR_CC: {
19796       SDValue Src1 = Op.getOperand(1);
19797       SDValue Src2 = Op.getOperand(2);
19798       SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19799       SDValue Mask = Op.getOperand(4);
19800
19801       SDValue Cmp;
19802       if (IntrData->Opc1 != 0) {
19803         SDValue Rnd = Op.getOperand(5);
19804         if (!isRoundModeCurDirection(Rnd))
19805           Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
19806       }
19807       //default rounding mode
19808       if(!Cmp.getNode())
19809         Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
19810
19811       SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19812                                              DAG.getTargetConstant(0, dl,
19813                                                                    MVT::i1),
19814                                              Subtarget, DAG);
19815       return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
19816                          DAG.getIntPtrConstant(0, dl));
19817     }
19818     case COMI: { // Comparison intrinsics
19819       ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19820       SDValue LHS = Op.getOperand(1);
19821       SDValue RHS = Op.getOperand(2);
19822       SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19823       SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19824       SDValue SetCC;
19825       switch (CC) {
19826       case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19827         SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19828         SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19829         SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19830         break;
19831       }
19832       case ISD::SETNE: { // (ZF = 1 or PF = 1)
19833         SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19834         SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19835         SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19836         break;
19837       }
19838       case ISD::SETGT: // (CF = 0 and ZF = 0)
19839         SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19840         break;
19841       case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19842         SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19843         break;
19844       }
19845       case ISD::SETGE: // CF = 0
19846         SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19847         break;
19848       case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19849         SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19850         break;
19851       default:
19852         llvm_unreachable("Unexpected illegal condition!");
19853       }
19854       return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19855     }
19856     case COMI_RM: { // Comparison intrinsics with Sae
19857       SDValue LHS = Op.getOperand(1);
19858       SDValue RHS = Op.getOperand(2);
19859       unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19860       SDValue Sae = Op.getOperand(4);
19861
19862       SDValue FCmp;
19863       if (isRoundModeCurDirection(Sae))
19864         FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
19865                            DAG.getConstant(CondVal, dl, MVT::i8));
19866       else
19867         FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
19868                            DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19869       return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
19870                          DAG.getIntPtrConstant(0, dl));
19871     }
19872     case VSHIFT:
19873       return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19874                                  Op.getOperand(1), Op.getOperand(2), Subtarget,
19875                                  DAG);
19876     case COMPRESS_EXPAND_IN_REG: {
19877       SDValue Mask = Op.getOperand(3);
19878       SDValue DataToCompress = Op.getOperand(1);
19879       SDValue PassThru = Op.getOperand(2);
19880       if (isAllOnesConstant(Mask)) // return data as is
19881         return Op.getOperand(1);
19882
19883       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19884                                               DataToCompress),
19885                                   Mask, PassThru, Subtarget, DAG);
19886     }
19887     case BROADCASTM: {
19888       SDValue Mask = Op.getOperand(1);
19889       MVT MaskVT = MVT::getVectorVT(MVT::i1,
19890                                     Mask.getSimpleValueType().getSizeInBits());
19891       Mask = DAG.getBitcast(MaskVT, Mask);
19892       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19893     }
19894     case KUNPCK: {
19895       MVT VT = Op.getSimpleValueType();
19896       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19897
19898       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19899       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19900       // Arguments should be swapped.
19901       SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19902                                 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19903                                 Src2, Src1);
19904       return DAG.getBitcast(VT, Res);
19905     }
19906     case MASK_BINOP: {
19907       MVT VT = Op.getSimpleValueType();
19908       MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19909
19910       SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19911       SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19912       SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19913       return DAG.getBitcast(VT, Res);
19914     }
19915     case FIXUPIMMS:
19916     case FIXUPIMMS_MASKZ:
19917     case FIXUPIMM:
19918     case FIXUPIMM_MASKZ:{
19919       SDValue Src1 = Op.getOperand(1);
19920       SDValue Src2 = Op.getOperand(2);
19921       SDValue Src3 = Op.getOperand(3);
19922       SDValue Imm = Op.getOperand(4);
19923       SDValue Mask = Op.getOperand(5);
19924       SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19925                                          Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19926       // We specify 2 possible modes for intrinsics, with/without rounding
19927       // modes.
19928       // First, we check if the intrinsic have rounding mode (7 operands),
19929       // if not, we set rounding mode to "current".
19930       SDValue Rnd;
19931       if (Op.getNumOperands() == 7)
19932         Rnd = Op.getOperand(6);
19933       else
19934         Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19935       if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19936         return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19937                                                 Src1, Src2, Src3, Imm, Rnd),
19938                                     Mask, Passthru, Subtarget, DAG);
19939       else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19940         return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19941                                        Src1, Src2, Src3, Imm, Rnd),
19942                                     Mask, Passthru, Subtarget, DAG);
19943     }
19944     case CONVERT_TO_MASK: {
19945       MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19946       MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19947       MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19948
19949       SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19950                                     Op.getOperand(1));
19951       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19952                                 DAG.getUNDEF(BitcastVT), CvtMask,
19953                                 DAG.getIntPtrConstant(0, dl));
19954       return DAG.getBitcast(Op.getValueType(), Res);
19955     }
19956     case BRCST_SUBVEC_TO_VEC: {
19957       SDValue Src = Op.getOperand(1);
19958       SDValue Passthru = Op.getOperand(2);
19959       SDValue Mask = Op.getOperand(3);
19960       EVT resVT = Passthru.getValueType();
19961       SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19962                                        DAG.getUNDEF(resVT), Src,
19963                                        DAG.getIntPtrConstant(0, dl));
19964       SDValue immVal;
19965       if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19966         immVal = DAG.getConstant(0x44, dl, MVT::i8);
19967       else
19968         immVal = DAG.getConstant(0, dl, MVT::i8);
19969       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19970                                               subVec, subVec, immVal),
19971                                   Mask, Passthru, Subtarget, DAG);
19972     }
19973     case BRCST32x2_TO_VEC: {
19974       SDValue Src = Op.getOperand(1);
19975       SDValue PassThru = Op.getOperand(2);
19976       SDValue Mask = Op.getOperand(3);
19977
19978       assert((VT.getScalarType() == MVT::i32 ||
19979               VT.getScalarType() == MVT::f32) && "Unexpected type!");
19980       //bitcast Src to packed 64
19981       MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19982       MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19983       Src = DAG.getBitcast(BitcastVT, Src);
19984
19985       return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19986                                   Mask, PassThru, Subtarget, DAG);
19987     }
19988     default:
19989       break;
19990     }
19991   }
19992
19993   switch (IntNo) {
19994   default: return SDValue();    // Don't custom lower most intrinsics.
19995
19996   case Intrinsic::x86_avx2_permd:
19997   case Intrinsic::x86_avx2_permps:
19998     // Operands intentionally swapped. Mask is last operand to intrinsic,
19999     // but second operand for node/instruction.
20000     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20001                        Op.getOperand(2), Op.getOperand(1));
20002
20003   // ptest and testp intrinsics. The intrinsic these come from are designed to
20004   // return an integer value, not just an instruction so lower it to the ptest
20005   // or testp pattern and a setcc for the result.
20006   case Intrinsic::x86_sse41_ptestz:
20007   case Intrinsic::x86_sse41_ptestc:
20008   case Intrinsic::x86_sse41_ptestnzc:
20009   case Intrinsic::x86_avx_ptestz_256:
20010   case Intrinsic::x86_avx_ptestc_256:
20011   case Intrinsic::x86_avx_ptestnzc_256:
20012   case Intrinsic::x86_avx_vtestz_ps:
20013   case Intrinsic::x86_avx_vtestc_ps:
20014   case Intrinsic::x86_avx_vtestnzc_ps:
20015   case Intrinsic::x86_avx_vtestz_pd:
20016   case Intrinsic::x86_avx_vtestc_pd:
20017   case Intrinsic::x86_avx_vtestnzc_pd:
20018   case Intrinsic::x86_avx_vtestz_ps_256:
20019   case Intrinsic::x86_avx_vtestc_ps_256:
20020   case Intrinsic::x86_avx_vtestnzc_ps_256:
20021   case Intrinsic::x86_avx_vtestz_pd_256:
20022   case Intrinsic::x86_avx_vtestc_pd_256:
20023   case Intrinsic::x86_avx_vtestnzc_pd_256: {
20024     bool IsTestPacked = false;
20025     X86::CondCode X86CC;
20026     switch (IntNo) {
20027     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20028     case Intrinsic::x86_avx_vtestz_ps:
20029     case Intrinsic::x86_avx_vtestz_pd:
20030     case Intrinsic::x86_avx_vtestz_ps_256:
20031     case Intrinsic::x86_avx_vtestz_pd_256:
20032       IsTestPacked = true;
20033       LLVM_FALLTHROUGH;
20034     case Intrinsic::x86_sse41_ptestz:
20035     case Intrinsic::x86_avx_ptestz_256:
20036       // ZF = 1
20037       X86CC = X86::COND_E;
20038       break;
20039     case Intrinsic::x86_avx_vtestc_ps:
20040     case Intrinsic::x86_avx_vtestc_pd:
20041     case Intrinsic::x86_avx_vtestc_ps_256:
20042     case Intrinsic::x86_avx_vtestc_pd_256:
20043       IsTestPacked = true;
20044       LLVM_FALLTHROUGH;
20045     case Intrinsic::x86_sse41_ptestc:
20046     case Intrinsic::x86_avx_ptestc_256:
20047       // CF = 1
20048       X86CC = X86::COND_B;
20049       break;
20050     case Intrinsic::x86_avx_vtestnzc_ps:
20051     case Intrinsic::x86_avx_vtestnzc_pd:
20052     case Intrinsic::x86_avx_vtestnzc_ps_256:
20053     case Intrinsic::x86_avx_vtestnzc_pd_256:
20054       IsTestPacked = true;
20055       LLVM_FALLTHROUGH;
20056     case Intrinsic::x86_sse41_ptestnzc:
20057     case Intrinsic::x86_avx_ptestnzc_256:
20058       // ZF and CF = 0
20059       X86CC = X86::COND_A;
20060       break;
20061     }
20062
20063     SDValue LHS = Op.getOperand(1);
20064     SDValue RHS = Op.getOperand(2);
20065     unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20066     SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20067     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20068     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20069   }
20070   case Intrinsic::x86_avx512_kortestz_w:
20071   case Intrinsic::x86_avx512_kortestc_w: {
20072     X86::CondCode X86CC =
20073         (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20074     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20075     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20076     SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20077     SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20078     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20079   }
20080
20081   case Intrinsic::x86_avx512_knot_w: {
20082     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20083     SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20084     SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20085     return DAG.getBitcast(MVT::i16, Res);
20086   }
20087
20088   case Intrinsic::x86_avx512_kandn_w: {
20089     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20090     // Invert LHS for the not.
20091     LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20092                       DAG.getConstant(1, dl, MVT::v16i1));
20093     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20094     SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20095     return DAG.getBitcast(MVT::i16, Res);
20096   }
20097
20098   case Intrinsic::x86_avx512_kxnor_w: {
20099     SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20100     SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20101     SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20102     // Invert result for the not.
20103     Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20104                       DAG.getConstant(1, dl, MVT::v16i1));
20105     return DAG.getBitcast(MVT::i16, Res);
20106   }
20107
20108   case Intrinsic::x86_sse42_pcmpistria128:
20109   case Intrinsic::x86_sse42_pcmpestria128:
20110   case Intrinsic::x86_sse42_pcmpistric128:
20111   case Intrinsic::x86_sse42_pcmpestric128:
20112   case Intrinsic::x86_sse42_pcmpistrio128:
20113   case Intrinsic::x86_sse42_pcmpestrio128:
20114   case Intrinsic::x86_sse42_pcmpistris128:
20115   case Intrinsic::x86_sse42_pcmpestris128:
20116   case Intrinsic::x86_sse42_pcmpistriz128:
20117   case Intrinsic::x86_sse42_pcmpestriz128: {
20118     unsigned Opcode;
20119     X86::CondCode X86CC;
20120     switch (IntNo) {
20121     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
20122     case Intrinsic::x86_sse42_pcmpistria128:
20123       Opcode = X86ISD::PCMPISTRI;
20124       X86CC = X86::COND_A;
20125       break;
20126     case Intrinsic::x86_sse42_pcmpestria128:
20127       Opcode = X86ISD::PCMPESTRI;
20128       X86CC = X86::COND_A;
20129       break;
20130     case Intrinsic::x86_sse42_pcmpistric128:
20131       Opcode = X86ISD::PCMPISTRI;
20132       X86CC = X86::COND_B;
20133       break;
20134     case Intrinsic::x86_sse42_pcmpestric128:
20135       Opcode = X86ISD::PCMPESTRI;
20136       X86CC = X86::COND_B;
20137       break;
20138     case Intrinsic::x86_sse42_pcmpistrio128:
20139       Opcode = X86ISD::PCMPISTRI;
20140       X86CC = X86::COND_O;
20141       break;
20142     case Intrinsic::x86_sse42_pcmpestrio128:
20143       Opcode = X86ISD::PCMPESTRI;
20144       X86CC = X86::COND_O;
20145       break;
20146     case Intrinsic::x86_sse42_pcmpistris128:
20147       Opcode = X86ISD::PCMPISTRI;
20148       X86CC = X86::COND_S;
20149       break;
20150     case Intrinsic::x86_sse42_pcmpestris128:
20151       Opcode = X86ISD::PCMPESTRI;
20152       X86CC = X86::COND_S;
20153       break;
20154     case Intrinsic::x86_sse42_pcmpistriz128:
20155       Opcode = X86ISD::PCMPISTRI;
20156       X86CC = X86::COND_E;
20157       break;
20158     case Intrinsic::x86_sse42_pcmpestriz128:
20159       Opcode = X86ISD::PCMPESTRI;
20160       X86CC = X86::COND_E;
20161       break;
20162     }
20163     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20164     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20165     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20166     SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20167     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20168   }
20169
20170   case Intrinsic::x86_sse42_pcmpistri128:
20171   case Intrinsic::x86_sse42_pcmpestri128: {
20172     unsigned Opcode;
20173     if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20174       Opcode = X86ISD::PCMPISTRI;
20175     else
20176       Opcode = X86ISD::PCMPESTRI;
20177
20178     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20179     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20180     return DAG.getNode(Opcode, dl, VTs, NewOps);
20181   }
20182
20183   case Intrinsic::eh_sjlj_lsda: {
20184     MachineFunction &MF = DAG.getMachineFunction();
20185     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20186     MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20187     auto &Context = MF.getMMI().getContext();
20188     MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20189                                             Twine(MF.getFunctionNumber()));
20190     return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20191   }
20192
20193   case Intrinsic::x86_seh_lsda: {
20194     // Compute the symbol for the LSDA. We know it'll get emitted later.
20195     MachineFunction &MF = DAG.getMachineFunction();
20196     SDValue Op1 = Op.getOperand(1);
20197     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20198     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20199         GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20200
20201     // Generate a simple absolute symbol reference. This intrinsic is only
20202     // supported on 32-bit Windows, which isn't PIC.
20203     SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20204     return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20205   }
20206
20207   case Intrinsic::x86_seh_recoverfp: {
20208     SDValue FnOp = Op.getOperand(1);
20209     SDValue IncomingFPOp = Op.getOperand(2);
20210     GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20211     auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20212     if (!Fn)
20213       report_fatal_error(
20214           "llvm.x86.seh.recoverfp must take a function as the first argument");
20215     return recoverFramePointer(DAG, Fn, IncomingFPOp);
20216   }
20217
20218   case Intrinsic::localaddress: {
20219     // Returns one of the stack, base, or frame pointer registers, depending on
20220     // which is used to reference local variables.
20221     MachineFunction &MF = DAG.getMachineFunction();
20222     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20223     unsigned Reg;
20224     if (RegInfo->hasBasePointer(MF))
20225       Reg = RegInfo->getBaseRegister();
20226     else // This function handles the SP or FP case.
20227       Reg = RegInfo->getPtrSizedFrameRegister(MF);
20228     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20229   }
20230   }
20231 }
20232
20233 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20234                                  SDValue Src, SDValue Mask, SDValue Base,
20235                                  SDValue Index, SDValue ScaleOp, SDValue Chain,
20236                                  const X86Subtarget &Subtarget) {
20237   SDLoc dl(Op);
20238   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20239   // Scale must be constant.
20240   if (!C)
20241     return SDValue();
20242   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20243   EVT MaskVT = Mask.getValueType();
20244   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20245   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20246   SDValue Segment = DAG.getRegister(0, MVT::i32);
20247   // If source is undef or we know it won't be used, use a zero vector
20248   // to break register dependency.
20249   // TODO: use undef instead and let ExecutionDepsFix deal with it?
20250   if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20251     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20252   SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20253   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20254   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20255   return DAG.getMergeValues(RetOps, dl);
20256 }
20257
20258 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20259                               SDValue Src, SDValue Mask, SDValue Base,
20260                               SDValue Index, SDValue ScaleOp, SDValue Chain,
20261                               const X86Subtarget &Subtarget) {
20262   SDLoc dl(Op);
20263   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20264   // Scale must be constant.
20265   if (!C)
20266     return SDValue();
20267   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20268   MVT MaskVT = MVT::getVectorVT(MVT::i1,
20269                              Index.getSimpleValueType().getVectorNumElements());
20270
20271   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20272   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20273   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20274   SDValue Segment = DAG.getRegister(0, MVT::i32);
20275   // If source is undef or we know it won't be used, use a zero vector
20276   // to break register dependency.
20277   // TODO: use undef instead and let ExecutionDepsFix deal with it?
20278   if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20279     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20280   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20281   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20282   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20283   return DAG.getMergeValues(RetOps, dl);
20284 }
20285
20286 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20287                                SDValue Src, SDValue Mask, SDValue Base,
20288                                SDValue Index, SDValue ScaleOp, SDValue Chain,
20289                                const X86Subtarget &Subtarget) {
20290   SDLoc dl(Op);
20291   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20292   // Scale must be constant.
20293   if (!C)
20294     return SDValue();
20295   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20296   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20297   SDValue Segment = DAG.getRegister(0, MVT::i32);
20298   MVT MaskVT = MVT::getVectorVT(MVT::i1,
20299                              Index.getSimpleValueType().getVectorNumElements());
20300
20301   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20302   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20303   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20304   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20305   return SDValue(Res, 1);
20306 }
20307
20308 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20309                                SDValue Mask, SDValue Base, SDValue Index,
20310                                SDValue ScaleOp, SDValue Chain,
20311                                const X86Subtarget &Subtarget) {
20312   SDLoc dl(Op);
20313   auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20314   // Scale must be constant.
20315   if (!C)
20316     return SDValue();
20317   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20318   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20319   SDValue Segment = DAG.getRegister(0, MVT::i32);
20320   MVT MaskVT =
20321     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20322   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20323   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20324   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20325   return SDValue(Res, 0);
20326 }
20327
20328 /// Handles the lowering of builtin intrinsic that return the value
20329 /// of the extended control register.
20330 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20331                                        SelectionDAG &DAG,
20332                                        const X86Subtarget &Subtarget,
20333                                        SmallVectorImpl<SDValue> &Results) {
20334   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20335   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20336   SDValue LO, HI;
20337
20338   // The ECX register is used to select the index of the XCR register to
20339   // return.
20340   SDValue Chain =
20341       DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20342   SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20343   Chain = SDValue(N1, 0);
20344
20345   // Reads the content of XCR and returns it in registers EDX:EAX.
20346   if (Subtarget.is64Bit()) {
20347     LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20348     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20349                             LO.getValue(2));
20350   } else {
20351     LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20352     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20353                             LO.getValue(2));
20354   }
20355   Chain = HI.getValue(1);
20356
20357   if (Subtarget.is64Bit()) {
20358     // Merge the two 32-bit values into a 64-bit one..
20359     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20360                               DAG.getConstant(32, DL, MVT::i8));
20361     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20362     Results.push_back(Chain);
20363     return;
20364   }
20365
20366   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20367   SDValue Ops[] = { LO, HI };
20368   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20369   Results.push_back(Pair);
20370   Results.push_back(Chain);
20371 }
20372
20373 /// Handles the lowering of builtin intrinsics that read performance monitor
20374 /// counters (x86_rdpmc).
20375 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20376                                       SelectionDAG &DAG,
20377                                       const X86Subtarget &Subtarget,
20378                                       SmallVectorImpl<SDValue> &Results) {
20379   assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20380   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20381   SDValue LO, HI;
20382
20383   // The ECX register is used to select the index of the performance counter
20384   // to read.
20385   SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20386                                    N->getOperand(2));
20387   SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20388
20389   // Reads the content of a 64-bit performance counter and returns it in the
20390   // registers EDX:EAX.
20391   if (Subtarget.is64Bit()) {
20392     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20393     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20394                             LO.getValue(2));
20395   } else {
20396     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20397     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20398                             LO.getValue(2));
20399   }
20400   Chain = HI.getValue(1);
20401
20402   if (Subtarget.is64Bit()) {
20403     // The EAX register is loaded with the low-order 32 bits. The EDX register
20404     // is loaded with the supported high-order bits of the counter.
20405     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20406                               DAG.getConstant(32, DL, MVT::i8));
20407     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20408     Results.push_back(Chain);
20409     return;
20410   }
20411
20412   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20413   SDValue Ops[] = { LO, HI };
20414   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20415   Results.push_back(Pair);
20416   Results.push_back(Chain);
20417 }
20418
20419 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20420 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20421 /// READCYCLECOUNTER nodes.
20422 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20423                                     SelectionDAG &DAG,
20424                                     const X86Subtarget &Subtarget,
20425                                     SmallVectorImpl<SDValue> &Results) {
20426   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20427   SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20428   SDValue LO, HI;
20429
20430   // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20431   // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20432   // and the EAX register is loaded with the low-order 32 bits.
20433   if (Subtarget.is64Bit()) {
20434     LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20435     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20436                             LO.getValue(2));
20437   } else {
20438     LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20439     HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20440                             LO.getValue(2));
20441   }
20442   SDValue Chain = HI.getValue(1);
20443
20444   if (Opcode == X86ISD::RDTSCP_DAG) {
20445     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20446
20447     // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20448     // the ECX register. Add 'ecx' explicitly to the chain.
20449     SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20450                                      HI.getValue(2));
20451     // Explicitly store the content of ECX at the location passed in input
20452     // to the 'rdtscp' intrinsic.
20453     Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20454                          MachinePointerInfo());
20455   }
20456
20457   if (Subtarget.is64Bit()) {
20458     // The EDX register is loaded with the high-order 32 bits of the MSR, and
20459     // the EAX register is loaded with the low-order 32 bits.
20460     SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20461                               DAG.getConstant(32, DL, MVT::i8));
20462     Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20463     Results.push_back(Chain);
20464     return;
20465   }
20466
20467   // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20468   SDValue Ops[] = { LO, HI };
20469   SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20470   Results.push_back(Pair);
20471   Results.push_back(Chain);
20472 }
20473
20474 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20475                                      SelectionDAG &DAG) {
20476   SmallVector<SDValue, 2> Results;
20477   SDLoc DL(Op);
20478   getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20479                           Results);
20480   return DAG.getMergeValues(Results, DL);
20481 }
20482
20483 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20484   MachineFunction &MF = DAG.getMachineFunction();
20485   SDValue Chain = Op.getOperand(0);
20486   SDValue RegNode = Op.getOperand(2);
20487   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20488   if (!EHInfo)
20489     report_fatal_error("EH registrations only live in functions using WinEH");
20490
20491   // Cast the operand to an alloca, and remember the frame index.
20492   auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20493   if (!FINode)
20494     report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20495   EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
20496
20497   // Return the chain operand without making any DAG nodes.
20498   return Chain;
20499 }
20500
20501 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20502   MachineFunction &MF = DAG.getMachineFunction();
20503   SDValue Chain = Op.getOperand(0);
20504   SDValue EHGuard = Op.getOperand(2);
20505   WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20506   if (!EHInfo)
20507     report_fatal_error("EHGuard only live in functions using WinEH");
20508
20509   // Cast the operand to an alloca, and remember the frame index.
20510   auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20511   if (!FINode)
20512     report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20513   EHInfo->EHGuardFrameIndex = FINode->getIndex();
20514
20515   // Return the chain operand without making any DAG nodes.
20516   return Chain;
20517 }
20518
20519 /// Emit Truncating Store with signed or unsigned saturation.
20520 static SDValue
20521 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20522                 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20523                 SelectionDAG &DAG) {
20524
20525   SDVTList VTs = DAG.getVTList(MVT::Other);
20526   SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20527   SDValue Ops[] = { Chain, Val, Ptr, Undef };
20528   return SignedSat ?
20529     DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20530     DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20531 }
20532
20533 /// Emit Masked Truncating Store with signed or unsigned saturation.
20534 static SDValue
20535 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20536                       SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20537                       MachineMemOperand *MMO, SelectionDAG &DAG) {
20538
20539   SDVTList VTs = DAG.getVTList(MVT::Other);
20540   SDValue Ops[] = { Chain, Ptr, Mask, Val };
20541   return SignedSat ?
20542     DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20543     DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20544 }
20545
20546 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20547                                       SelectionDAG &DAG) {
20548   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20549
20550   const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
20551   if (!IntrData) {
20552     switch (IntNo) {
20553     case llvm::Intrinsic::x86_seh_ehregnode:
20554       return MarkEHRegistrationNode(Op, DAG);
20555     case llvm::Intrinsic::x86_seh_ehguard:
20556       return MarkEHGuard(Op, DAG);
20557     case llvm::Intrinsic::x86_flags_read_u32:
20558     case llvm::Intrinsic::x86_flags_read_u64:
20559     case llvm::Intrinsic::x86_flags_write_u32:
20560     case llvm::Intrinsic::x86_flags_write_u64: {
20561       // We need a frame pointer because this will get lowered to a PUSH/POP
20562       // sequence.
20563       MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20564       MFI.setHasCopyImplyingStackAdjustment(true);
20565       // Don't do anything here, we will expand these intrinsics out later
20566       // during ExpandISelPseudos in EmitInstrWithCustomInserter.
20567       return SDValue();
20568     }
20569     case Intrinsic::x86_lwpins32:
20570     case Intrinsic::x86_lwpins64: {
20571       SDLoc dl(Op);
20572       SDValue Chain = Op->getOperand(0);
20573       SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20574       SDValue LwpIns =
20575           DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20576                       Op->getOperand(3), Op->getOperand(4));
20577       SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20578       SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20579       return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20580                          LwpIns.getValue(1));
20581     }
20582     }
20583     return SDValue();
20584   }
20585
20586   SDLoc dl(Op);
20587   switch(IntrData->Type) {
20588   default: llvm_unreachable("Unknown Intrinsic Type");
20589   case RDSEED:
20590   case RDRAND: {
20591     // Emit the node with the right value type.
20592     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20593     SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20594
20595     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20596     // Otherwise return the value from Rand, which is always 0, casted to i32.
20597     SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20598                       DAG.getConstant(1, dl, Op->getValueType(1)),
20599                       DAG.getConstant(X86::COND_B, dl, MVT::i32),
20600                       SDValue(Result.getNode(), 1) };
20601     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
20602                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
20603                                   Ops);
20604
20605     // Return { result, isValid, chain }.
20606     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20607                        SDValue(Result.getNode(), 2));
20608   }
20609   case GATHER_AVX2: {
20610     SDValue Chain = Op.getOperand(0);
20611     SDValue Src   = Op.getOperand(2);
20612     SDValue Base  = Op.getOperand(3);
20613     SDValue Index = Op.getOperand(4);
20614     SDValue Mask  = Op.getOperand(5);
20615     SDValue Scale = Op.getOperand(6);
20616     return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20617                              Scale, Chain, Subtarget);
20618   }
20619   case GATHER: {
20620   //gather(v1, mask, index, base, scale);
20621     SDValue Chain = Op.getOperand(0);
20622     SDValue Src   = Op.getOperand(2);
20623     SDValue Base  = Op.getOperand(3);
20624     SDValue Index = Op.getOperand(4);
20625     SDValue Mask  = Op.getOperand(5);
20626     SDValue Scale = Op.getOperand(6);
20627     return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
20628                          Chain, Subtarget);
20629   }
20630   case SCATTER: {
20631   //scatter(base, mask, index, v1, scale);
20632     SDValue Chain = Op.getOperand(0);
20633     SDValue Base  = Op.getOperand(2);
20634     SDValue Mask  = Op.getOperand(3);
20635     SDValue Index = Op.getOperand(4);
20636     SDValue Src   = Op.getOperand(5);
20637     SDValue Scale = Op.getOperand(6);
20638     return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20639                           Scale, Chain, Subtarget);
20640   }
20641   case PREFETCH: {
20642     SDValue Hint = Op.getOperand(6);
20643     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20644     assert((HintVal == 2 || HintVal == 3) &&
20645            "Wrong prefetch hint in intrinsic: should be 2 or 3");
20646     unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20647     SDValue Chain = Op.getOperand(0);
20648     SDValue Mask  = Op.getOperand(2);
20649     SDValue Index = Op.getOperand(3);
20650     SDValue Base  = Op.getOperand(4);
20651     SDValue Scale = Op.getOperand(5);
20652     return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
20653                            Subtarget);
20654   }
20655   // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
20656   case RDTSC: {
20657     SmallVector<SDValue, 2> Results;
20658     getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20659                             Results);
20660     return DAG.getMergeValues(Results, dl);
20661   }
20662   // Read Performance Monitoring Counters.
20663   case RDPMC: {
20664     SmallVector<SDValue, 2> Results;
20665     getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20666     return DAG.getMergeValues(Results, dl);
20667   }
20668   // Get Extended Control Register.
20669   case XGETBV: {
20670     SmallVector<SDValue, 2> Results;
20671     getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20672     return DAG.getMergeValues(Results, dl);
20673   }
20674   // XTEST intrinsics.
20675   case XTEST: {
20676     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20677     SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20678
20679     SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20680     SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20681     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20682                        Ret, SDValue(InTrans.getNode(), 1));
20683   }
20684   // ADC/ADCX/SBB
20685   case ADX: {
20686     SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
20687     SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
20688     SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20689                                 DAG.getConstant(-1, dl, MVT::i8));
20690     SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20691                               Op.getOperand(4), GenCF.getValue(1));
20692     SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20693                                  Op.getOperand(5), MachinePointerInfo());
20694     SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20695     SDValue Results[] = { SetCC, Store };
20696     return DAG.getMergeValues(Results, dl);
20697   }
20698   case COMPRESS_TO_MEM: {
20699     SDValue Mask = Op.getOperand(4);
20700     SDValue DataToCompress = Op.getOperand(3);
20701     SDValue Addr = Op.getOperand(2);
20702     SDValue Chain = Op.getOperand(0);
20703     MVT VT = DataToCompress.getSimpleValueType();
20704
20705     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20706     assert(MemIntr && "Expected MemIntrinsicSDNode!");
20707
20708     if (isAllOnesConstant(Mask)) // return just a store
20709       return DAG.getStore(Chain, dl, DataToCompress, Addr,
20710                           MemIntr->getMemOperand());
20711
20712     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20713     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20714
20715     return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20716                               MemIntr->getMemOperand(),
20717                               false /* truncating */, true /* compressing */);
20718   }
20719   case TRUNCATE_TO_MEM_VI8:
20720   case TRUNCATE_TO_MEM_VI16:
20721   case TRUNCATE_TO_MEM_VI32: {
20722     SDValue Mask = Op.getOperand(4);
20723     SDValue DataToTruncate = Op.getOperand(3);
20724     SDValue Addr = Op.getOperand(2);
20725     SDValue Chain = Op.getOperand(0);
20726
20727     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20728     assert(MemIntr && "Expected MemIntrinsicSDNode!");
20729
20730     EVT MemVT  = MemIntr->getMemoryVT();
20731
20732     uint16_t TruncationOp = IntrData->Opc0;
20733     switch (TruncationOp) {
20734     case X86ISD::VTRUNC: {
20735       if (isAllOnesConstant(Mask)) // return just a truncate store
20736         return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20737                                  MemIntr->getMemOperand());
20738
20739       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20740       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20741
20742       return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20743                                 MemIntr->getMemOperand(), true /* truncating */);
20744     }
20745     case X86ISD::VTRUNCUS:
20746     case X86ISD::VTRUNCS: {
20747       bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20748       if (isAllOnesConstant(Mask))
20749         return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20750                                MemIntr->getMemOperand(), DAG);
20751
20752       MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20753       SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20754
20755       return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20756                                    VMask, MemVT, MemIntr->getMemOperand(), DAG);
20757     }
20758     default:
20759       llvm_unreachable("Unsupported truncstore intrinsic");
20760     }
20761   }
20762
20763   case EXPAND_FROM_MEM: {
20764     SDValue Mask = Op.getOperand(4);
20765     SDValue PassThru = Op.getOperand(3);
20766     SDValue Addr = Op.getOperand(2);
20767     SDValue Chain = Op.getOperand(0);
20768     MVT VT = Op.getSimpleValueType();
20769
20770     MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20771     assert(MemIntr && "Expected MemIntrinsicSDNode!");
20772
20773     if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20774       return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20775     if (X86::isZeroNode(Mask))
20776       return DAG.getUNDEF(VT);
20777
20778     MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20779     SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20780     return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20781                              MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20782                              true /* expanding */);
20783   }
20784   }
20785 }
20786
20787 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20788                                            SelectionDAG &DAG) const {
20789   MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20790   MFI.setReturnAddressIsTaken(true);
20791
20792   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20793     return SDValue();
20794
20795   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20796   SDLoc dl(Op);
20797   EVT PtrVT = getPointerTy(DAG.getDataLayout());
20798
20799   if (Depth > 0) {
20800     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20801     const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20802     SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20803     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20804                        DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20805                        MachinePointerInfo());
20806   }
20807
20808   // Just load the return address.
20809   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20810   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20811                      MachinePointerInfo());
20812 }
20813
20814 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20815                                                  SelectionDAG &DAG) const {
20816   DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20817   return getReturnAddressFrameIndex(DAG);
20818 }
20819
20820 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20821   MachineFunction &MF = DAG.getMachineFunction();
20822   MachineFrameInfo &MFI = MF.getFrameInfo();
20823   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20824   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20825   EVT VT = Op.getValueType();
20826
20827   MFI.setFrameAddressIsTaken(true);
20828
20829   if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20830     // Depth > 0 makes no sense on targets which use Windows unwind codes.  It
20831     // is not possible to crawl up the stack without looking at the unwind codes
20832     // simultaneously.
20833     int FrameAddrIndex = FuncInfo->getFAIndex();
20834     if (!FrameAddrIndex) {
20835       // Set up a frame object for the return address.
20836       unsigned SlotSize = RegInfo->getSlotSize();
20837       FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20838           SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20839       FuncInfo->setFAIndex(FrameAddrIndex);
20840     }
20841     return DAG.getFrameIndex(FrameAddrIndex, VT);
20842   }
20843
20844   unsigned FrameReg =
20845       RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20846   SDLoc dl(Op);  // FIXME probably not meaningful
20847   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20848   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20849           (FrameReg == X86::EBP && VT == MVT::i32)) &&
20850          "Invalid Frame Register!");
20851   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20852   while (Depth--)
20853     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20854                             MachinePointerInfo());
20855   return FrameAddr;
20856 }
20857
20858 // FIXME? Maybe this could be a TableGen attribute on some registers and
20859 // this table could be generated automatically from RegInfo.
20860 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20861                                               SelectionDAG &DAG) const {
20862   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20863   const MachineFunction &MF = DAG.getMachineFunction();
20864
20865   unsigned Reg = StringSwitch<unsigned>(RegName)
20866                        .Case("esp", X86::ESP)
20867                        .Case("rsp", X86::RSP)
20868                        .Case("ebp", X86::EBP)
20869                        .Case("rbp", X86::RBP)
20870                        .Default(0);
20871
20872   if (Reg == X86::EBP || Reg == X86::RBP) {
20873     if (!TFI.hasFP(MF))
20874       report_fatal_error("register " + StringRef(RegName) +
20875                          " is allocatable: function has no frame pointer");
20876 #ifndef NDEBUG
20877     else {
20878       const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20879       unsigned FrameReg =
20880           RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20881       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20882              "Invalid Frame Register!");
20883     }
20884 #endif
20885   }
20886
20887   if (Reg)
20888     return Reg;
20889
20890   report_fatal_error("Invalid register name global variable");
20891 }
20892
20893 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20894                                                      SelectionDAG &DAG) const {
20895   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20896   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20897 }
20898
20899 unsigned X86TargetLowering::getExceptionPointerRegister(
20900     const Constant *PersonalityFn) const {
20901   if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20902     return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20903
20904   return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20905 }
20906
20907 unsigned X86TargetLowering::getExceptionSelectorRegister(
20908     const Constant *PersonalityFn) const {
20909   // Funclet personalities don't use selectors (the runtime does the selection).
20910   assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20911   return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20912 }
20913
20914 bool X86TargetLowering::needsFixedCatchObjects() const {
20915   return Subtarget.isTargetWin64();
20916 }
20917
20918 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20919   SDValue Chain     = Op.getOperand(0);
20920   SDValue Offset    = Op.getOperand(1);
20921   SDValue Handler   = Op.getOperand(2);
20922   SDLoc dl      (Op);
20923
20924   EVT PtrVT = getPointerTy(DAG.getDataLayout());
20925   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20926   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20927   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20928           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20929          "Invalid Frame Register!");
20930   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20931   unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20932
20933   SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20934                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20935                                                        dl));
20936   StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20937   Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20938   Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20939
20940   return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20941                      DAG.getRegister(StoreAddrReg, PtrVT));
20942 }
20943
20944 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20945                                                SelectionDAG &DAG) const {
20946   SDLoc DL(Op);
20947   // If the subtarget is not 64bit, we may need the global base reg
20948   // after isel expand pseudo, i.e., after CGBR pass ran.
20949   // Therefore, ask for the GlobalBaseReg now, so that the pass
20950   // inserts the code for us in case we need it.
20951   // Otherwise, we will end up in a situation where we will
20952   // reference a virtual register that is not defined!
20953   if (!Subtarget.is64Bit()) {
20954     const X86InstrInfo *TII = Subtarget.getInstrInfo();
20955     (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20956   }
20957   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20958                      DAG.getVTList(MVT::i32, MVT::Other),
20959                      Op.getOperand(0), Op.getOperand(1));
20960 }
20961
20962 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20963                                                 SelectionDAG &DAG) const {
20964   SDLoc DL(Op);
20965   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20966                      Op.getOperand(0), Op.getOperand(1));
20967 }
20968
20969 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20970                                                        SelectionDAG &DAG) const {
20971   SDLoc DL(Op);
20972   return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20973                      Op.getOperand(0));
20974 }
20975
20976 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20977   return Op.getOperand(0);
20978 }
20979
20980 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20981                                                 SelectionDAG &DAG) const {
20982   SDValue Root = Op.getOperand(0);
20983   SDValue Trmp = Op.getOperand(1); // trampoline
20984   SDValue FPtr = Op.getOperand(2); // nested function
20985   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20986   SDLoc dl (Op);
20987
20988   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20989   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20990
20991   if (Subtarget.is64Bit()) {
20992     SDValue OutChains[6];
20993
20994     // Large code-model.
20995     const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
20996     const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20997
20998     const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20999     const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21000
21001     const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
21002
21003     // Load the pointer to the nested function into R11.
21004     unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21005     SDValue Addr = Trmp;
21006     OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21007                                 Addr, MachinePointerInfo(TrmpAddr));
21008
21009     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21010                        DAG.getConstant(2, dl, MVT::i64));
21011     OutChains[1] =
21012         DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21013                      /* Alignment = */ 2);
21014
21015     // Load the 'nest' parameter value into R10.
21016     // R10 is specified in X86CallingConv.td
21017     OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21018     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21019                        DAG.getConstant(10, dl, MVT::i64));
21020     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21021                                 Addr, MachinePointerInfo(TrmpAddr, 10));
21022
21023     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21024                        DAG.getConstant(12, dl, MVT::i64));
21025     OutChains[3] =
21026         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21027                      /* Alignment = */ 2);
21028
21029     // Jump to the nested function.
21030     OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21031     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21032                        DAG.getConstant(20, dl, MVT::i64));
21033     OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21034                                 Addr, MachinePointerInfo(TrmpAddr, 20));
21035
21036     unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21037     Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21038                        DAG.getConstant(22, dl, MVT::i64));
21039     OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21040                                 Addr, MachinePointerInfo(TrmpAddr, 22));
21041
21042     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21043   } else {
21044     const Function *Func =
21045       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21046     CallingConv::ID CC = Func->getCallingConv();
21047     unsigned NestReg;
21048
21049     switch (CC) {
21050     default:
21051       llvm_unreachable("Unsupported calling convention");
21052     case CallingConv::C:
21053     case CallingConv::X86_StdCall: {
21054       // Pass 'nest' parameter in ECX.
21055       // Must be kept in sync with X86CallingConv.td
21056       NestReg = X86::ECX;
21057
21058       // Check that ECX wasn't needed by an 'inreg' parameter.
21059       FunctionType *FTy = Func->getFunctionType();
21060       const AttributeList &Attrs = Func->getAttributes();
21061
21062       if (!Attrs.isEmpty() && !Func->isVarArg()) {
21063         unsigned InRegCount = 0;
21064         unsigned Idx = 1;
21065
21066         for (FunctionType::param_iterator I = FTy->param_begin(),
21067              E = FTy->param_end(); I != E; ++I, ++Idx)
21068           if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21069             auto &DL = DAG.getDataLayout();
21070             // FIXME: should only count parameters that are lowered to integers.
21071             InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21072           }
21073
21074         if (InRegCount > 2) {
21075           report_fatal_error("Nest register in use - reduce number of inreg"
21076                              " parameters!");
21077         }
21078       }
21079       break;
21080     }
21081     case CallingConv::X86_FastCall:
21082     case CallingConv::X86_ThisCall:
21083     case CallingConv::Fast:
21084       // Pass 'nest' parameter in EAX.
21085       // Must be kept in sync with X86CallingConv.td
21086       NestReg = X86::EAX;
21087       break;
21088     }
21089
21090     SDValue OutChains[4];
21091     SDValue Addr, Disp;
21092
21093     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21094                        DAG.getConstant(10, dl, MVT::i32));
21095     Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21096
21097     // This is storing the opcode for MOV32ri.
21098     const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21099     const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21100     OutChains[0] =
21101         DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21102                      Trmp, MachinePointerInfo(TrmpAddr));
21103
21104     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21105                        DAG.getConstant(1, dl, MVT::i32));
21106     OutChains[1] =
21107         DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21108                      /* Alignment = */ 1);
21109
21110     const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21111     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21112                        DAG.getConstant(5, dl, MVT::i32));
21113     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21114                                 Addr, MachinePointerInfo(TrmpAddr, 5),
21115                                 /* Alignment = */ 1);
21116
21117     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21118                        DAG.getConstant(6, dl, MVT::i32));
21119     OutChains[3] =
21120         DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21121                      /* Alignment = */ 1);
21122
21123     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21124   }
21125 }
21126
21127 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21128                                             SelectionDAG &DAG) const {
21129   /*
21130    The rounding mode is in bits 11:10 of FPSR, and has the following
21131    settings:
21132      00 Round to nearest
21133      01 Round to -inf
21134      10 Round to +inf
21135      11 Round to 0
21136
21137   FLT_ROUNDS, on the other hand, expects the following:
21138     -1 Undefined
21139      0 Round to 0
21140      1 Round to nearest
21141      2 Round to +inf
21142      3 Round to -inf
21143
21144   To perform the conversion, we do:
21145     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
21146   */
21147
21148   MachineFunction &MF = DAG.getMachineFunction();
21149   const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21150   unsigned StackAlignment = TFI.getStackAlignment();
21151   MVT VT = Op.getSimpleValueType();
21152   SDLoc DL(Op);
21153
21154   // Save FP Control Word to stack slot
21155   int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21156   SDValue StackSlot =
21157       DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21158
21159   MachineMemOperand *MMO =
21160       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21161                               MachineMemOperand::MOStore, 2, 2);
21162
21163   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21164   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21165                                           DAG.getVTList(MVT::Other),
21166                                           Ops, MVT::i16, MMO);
21167
21168   // Load FP Control Word from stack slot
21169   SDValue CWD =
21170       DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21171
21172   // Transform as necessary
21173   SDValue CWD1 =
21174     DAG.getNode(ISD::SRL, DL, MVT::i16,
21175                 DAG.getNode(ISD::AND, DL, MVT::i16,
21176                             CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21177                 DAG.getConstant(11, DL, MVT::i8));
21178   SDValue CWD2 =
21179     DAG.getNode(ISD::SRL, DL, MVT::i16,
21180                 DAG.getNode(ISD::AND, DL, MVT::i16,
21181                             CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21182                 DAG.getConstant(9, DL, MVT::i8));
21183
21184   SDValue RetVal =
21185     DAG.getNode(ISD::AND, DL, MVT::i16,
21186                 DAG.getNode(ISD::ADD, DL, MVT::i16,
21187                             DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21188                             DAG.getConstant(1, DL, MVT::i16)),
21189                 DAG.getConstant(3, DL, MVT::i16));
21190
21191   return DAG.getNode((VT.getSizeInBits() < 16 ?
21192                       ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21193 }
21194
21195 // Split an unary integer op into 2 half sized ops.
21196 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21197   MVT VT = Op.getSimpleValueType();
21198   unsigned NumElems = VT.getVectorNumElements();
21199   unsigned SizeInBits = VT.getSizeInBits();
21200
21201   // Extract the Lo/Hi vectors
21202   SDLoc dl(Op);
21203   SDValue Src = Op.getOperand(0);
21204   SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21205   SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21206
21207   MVT EltVT = VT.getVectorElementType();
21208   MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21209   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21210                      DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21211                      DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21212 }
21213
21214 // Decompose 256-bit ops into smaller 128-bit ops.
21215 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21216   assert(Op.getSimpleValueType().is256BitVector() &&
21217          Op.getSimpleValueType().isInteger() &&
21218          "Only handle AVX 256-bit vector integer operation");
21219   return LowerVectorIntUnary(Op, DAG);
21220 }
21221
21222 // Decompose 512-bit ops into smaller 256-bit ops.
21223 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21224   assert(Op.getSimpleValueType().is512BitVector() &&
21225          Op.getSimpleValueType().isInteger() &&
21226          "Only handle AVX 512-bit vector integer operation");
21227   return LowerVectorIntUnary(Op, DAG);
21228 }
21229
21230 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
21231 //
21232 // i8/i16 vector implemented using dword LZCNT vector instruction
21233 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21234 // split the vector, perform operation on it's Lo a Hi part and
21235 // concatenate the results.
21236 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21237   assert(Op.getOpcode() == ISD::CTLZ);
21238   SDLoc dl(Op);
21239   MVT VT = Op.getSimpleValueType();
21240   MVT EltVT = VT.getVectorElementType();
21241   unsigned NumElems = VT.getVectorNumElements();
21242
21243   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21244           "Unsupported element type");
21245
21246   // Split vector, it's Lo and Hi parts will be handled in next iteration.
21247   if (16 < NumElems)
21248     return LowerVectorIntUnary(Op, DAG);
21249
21250   MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21251   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21252           "Unsupported value type for operation");
21253
21254   // Use native supported vector instruction vplzcntd.
21255   Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21256   SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21257   SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21258   SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21259
21260   return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21261 }
21262
21263 // Lower CTLZ using a PSHUFB lookup table implementation.
21264 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21265                                        const X86Subtarget &Subtarget,
21266                                        SelectionDAG &DAG) {
21267   MVT VT = Op.getSimpleValueType();
21268   int NumElts = VT.getVectorNumElements();
21269   int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21270   MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21271
21272   // Per-nibble leading zero PSHUFB lookup table.
21273   const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21274                        /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21275                        /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21276                        /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21277
21278   SmallVector<SDValue, 64> LUTVec;
21279   for (int i = 0; i < NumBytes; ++i)
21280     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21281   SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21282
21283   // Begin by bitcasting the input to byte vector, then split those bytes
21284   // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
21285   // If the hi input nibble is zero then we add both results together, otherwise
21286   // we just take the hi result (by masking the lo result to zero before the
21287   // add).
21288   SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21289   SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21290
21291   SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21292   SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21293   SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21294   SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21295   SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21296
21297   Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21298   Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21299   Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21300   SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21301
21302   // Merge result back from vXi8 back to VT, working on the lo/hi halves
21303   // of the current vector width in the same way we did for the nibbles.
21304   // If the upper half of the input element is zero then add the halves'
21305   // leading zero counts together, otherwise just use the upper half's.
21306   // Double the width of the result until we are at target width.
21307   while (CurrVT != VT) {
21308     int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21309     int CurrNumElts = CurrVT.getVectorNumElements();
21310     MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21311     MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21312     SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21313
21314     // Check if the upper half of the input element is zero.
21315     SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21316                                DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21317     HiZ = DAG.getBitcast(NextVT, HiZ);
21318
21319     // Move the upper/lower halves to the lower bits as we'll be extending to
21320     // NextVT. Mask the lower result to zero if HiZ is true and add the results
21321     // together.
21322     SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21323     SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21324     SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21325     R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21326     Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21327     CurrVT = NextVT;
21328   }
21329
21330   return Res;
21331 }
21332
21333 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21334                                const X86Subtarget &Subtarget,
21335                                SelectionDAG &DAG) {
21336   MVT VT = Op.getSimpleValueType();
21337
21338   if (Subtarget.hasCDI())
21339     return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21340
21341   // Decompose 256-bit ops into smaller 128-bit ops.
21342   if (VT.is256BitVector() && !Subtarget.hasInt256())
21343     return Lower256IntUnary(Op, DAG);
21344
21345   // Decompose 512-bit ops into smaller 256-bit ops.
21346   if (VT.is512BitVector() && !Subtarget.hasBWI())
21347     return Lower512IntUnary(Op, DAG);
21348
21349   assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21350   return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21351 }
21352
21353 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21354                          SelectionDAG &DAG) {
21355   MVT VT = Op.getSimpleValueType();
21356   MVT OpVT = VT;
21357   unsigned NumBits = VT.getSizeInBits();
21358   SDLoc dl(Op);
21359   unsigned Opc = Op.getOpcode();
21360
21361   if (VT.isVector())
21362     return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21363
21364   Op = Op.getOperand(0);
21365   if (VT == MVT::i8) {
21366     // Zero extend to i32 since there is not an i8 bsr.
21367     OpVT = MVT::i32;
21368     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21369   }
21370
21371   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21372   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21373   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21374
21375   if (Opc == ISD::CTLZ) {
21376     // If src is zero (i.e. bsr sets ZF), returns NumBits.
21377     SDValue Ops[] = {
21378       Op,
21379       DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21380       DAG.getConstant(X86::COND_E, dl, MVT::i8),
21381       Op.getValue(1)
21382     };
21383     Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21384   }
21385
21386   // Finally xor with NumBits-1.
21387   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21388                    DAG.getConstant(NumBits - 1, dl, OpVT));
21389
21390   if (VT == MVT::i8)
21391     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21392   return Op;
21393 }
21394
21395 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21396   MVT VT = Op.getSimpleValueType();
21397   unsigned NumBits = VT.getScalarSizeInBits();
21398   SDLoc dl(Op);
21399
21400   if (VT.isVector()) {
21401     SDValue N0 = Op.getOperand(0);
21402     SDValue Zero = DAG.getConstant(0, dl, VT);
21403
21404     // lsb(x) = (x & -x)
21405     SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21406                               DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21407
21408     // cttz_undef(x) = (width - 1) - ctlz(lsb)
21409     if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21410       SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21411       return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21412                          DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21413     }
21414
21415     // cttz(x) = ctpop(lsb - 1)
21416     SDValue One = DAG.getConstant(1, dl, VT);
21417     return DAG.getNode(ISD::CTPOP, dl, VT,
21418                        DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21419   }
21420
21421   assert(Op.getOpcode() == ISD::CTTZ &&
21422          "Only scalar CTTZ requires custom lowering");
21423
21424   // Issue a bsf (scan bits forward) which also sets EFLAGS.
21425   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21426   Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21427
21428   // If src is zero (i.e. bsf sets ZF), returns NumBits.
21429   SDValue Ops[] = {
21430     Op,
21431     DAG.getConstant(NumBits, dl, VT),
21432     DAG.getConstant(X86::COND_E, dl, MVT::i8),
21433     Op.getValue(1)
21434   };
21435   return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21436 }
21437
21438 /// Break a 256-bit integer operation into two new 128-bit ones and then
21439 /// concatenate the result back.
21440 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21441   MVT VT = Op.getSimpleValueType();
21442
21443   assert(VT.is256BitVector() && VT.isInteger() &&
21444          "Unsupported value type for operation");
21445
21446   unsigned NumElems = VT.getVectorNumElements();
21447   SDLoc dl(Op);
21448
21449   // Extract the LHS vectors
21450   SDValue LHS = Op.getOperand(0);
21451   SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21452   SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21453
21454   // Extract the RHS vectors
21455   SDValue RHS = Op.getOperand(1);
21456   SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21457   SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21458
21459   MVT EltVT = VT.getVectorElementType();
21460   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21461
21462   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21463                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21464                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21465 }
21466
21467 /// Break a 512-bit integer operation into two new 256-bit ones and then
21468 /// concatenate the result back.
21469 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21470   MVT VT = Op.getSimpleValueType();
21471
21472   assert(VT.is512BitVector() && VT.isInteger() &&
21473          "Unsupported value type for operation");
21474
21475   unsigned NumElems = VT.getVectorNumElements();
21476   SDLoc dl(Op);
21477
21478   // Extract the LHS vectors
21479   SDValue LHS = Op.getOperand(0);
21480   SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21481   SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21482
21483   // Extract the RHS vectors
21484   SDValue RHS = Op.getOperand(1);
21485   SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21486   SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21487
21488   MVT EltVT = VT.getVectorElementType();
21489   MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21490
21491   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21492                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21493                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21494 }
21495
21496 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21497   MVT VT = Op.getSimpleValueType();
21498   if (VT.getScalarType() == MVT::i1)
21499     return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21500                        Op.getOperand(0), Op.getOperand(1));
21501   assert(Op.getSimpleValueType().is256BitVector() &&
21502          Op.getSimpleValueType().isInteger() &&
21503          "Only handle AVX 256-bit vector integer operation");
21504   return Lower256IntArith(Op, DAG);
21505 }
21506
21507 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21508   assert(Op.getSimpleValueType().is256BitVector() &&
21509          Op.getSimpleValueType().isInteger() &&
21510          "Only handle AVX 256-bit vector integer operation");
21511   return Lower256IntUnary(Op, DAG);
21512 }
21513
21514 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21515   assert(Op.getSimpleValueType().is256BitVector() &&
21516          Op.getSimpleValueType().isInteger() &&
21517          "Only handle AVX 256-bit vector integer operation");
21518   return Lower256IntArith(Op, DAG);
21519 }
21520
21521 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21522                         SelectionDAG &DAG) {
21523   SDLoc dl(Op);
21524   MVT VT = Op.getSimpleValueType();
21525
21526   if (VT.getScalarType() == MVT::i1)
21527     return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21528
21529   // Decompose 256-bit ops into smaller 128-bit ops.
21530   if (VT.is256BitVector() && !Subtarget.hasInt256())
21531     return Lower256IntArith(Op, DAG);
21532
21533   SDValue A = Op.getOperand(0);
21534   SDValue B = Op.getOperand(1);
21535
21536   // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21537   // vector pairs, multiply and truncate.
21538   if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21539     if (Subtarget.hasInt256()) {
21540       // For 512-bit vectors, split into 256-bit vectors to allow the
21541       // sign-extension to occur.
21542       if (VT == MVT::v64i8)
21543         return Lower512IntArith(Op, DAG);
21544
21545       // For 256-bit vectors, split into 128-bit vectors to allow the
21546       // sign-extension to occur. We don't need this on AVX512BW as we can
21547       // safely sign-extend to v32i16.
21548       if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21549         return Lower256IntArith(Op, DAG);
21550
21551       MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21552       return DAG.getNode(
21553           ISD::TRUNCATE, dl, VT,
21554           DAG.getNode(ISD::MUL, dl, ExVT,
21555                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21556                       DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21557     }
21558
21559     assert(VT == MVT::v16i8 &&
21560            "Pre-AVX2 support only supports v16i8 multiplication");
21561     MVT ExVT = MVT::v8i16;
21562
21563     // Extract the lo parts and sign extend to i16
21564     SDValue ALo, BLo;
21565     if (Subtarget.hasSSE41()) {
21566       ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21567       BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21568     } else {
21569       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21570                               -1, 4, -1, 5, -1, 6, -1, 7};
21571       ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21572       BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21573       ALo = DAG.getBitcast(ExVT, ALo);
21574       BLo = DAG.getBitcast(ExVT, BLo);
21575       ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21576       BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21577     }
21578
21579     // Extract the hi parts and sign extend to i16
21580     SDValue AHi, BHi;
21581     if (Subtarget.hasSSE41()) {
21582       const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21583                               -1, -1, -1, -1, -1, -1, -1, -1};
21584       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21585       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21586       AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21587       BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21588     } else {
21589       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
21590                               -1, 12, -1, 13, -1, 14, -1, 15};
21591       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21592       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21593       AHi = DAG.getBitcast(ExVT, AHi);
21594       BHi = DAG.getBitcast(ExVT, BHi);
21595       AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21596       BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21597     }
21598
21599     // Multiply, mask the lower 8bits of the lo/hi results and pack
21600     SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21601     SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21602     RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21603     RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21604     return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21605   }
21606
21607   // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21608   if (VT == MVT::v4i32) {
21609     assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21610            "Should not custom lower when pmuldq is available!");
21611
21612     // Extract the odd parts.
21613     static const int UnpackMask[] = { 1, -1, 3, -1 };
21614     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21615     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21616
21617     // Multiply the even parts.
21618     SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21619     // Now multiply odd parts.
21620     SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21621
21622     Evens = DAG.getBitcast(VT, Evens);
21623     Odds = DAG.getBitcast(VT, Odds);
21624
21625     // Merge the two vectors back together with a shuffle. This expands into 2
21626     // shuffles.
21627     static const int ShufMask[] = { 0, 4, 2, 6 };
21628     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21629   }
21630
21631   assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21632          "Only know how to lower V2I64/V4I64/V8I64 multiply");
21633
21634   // 32-bit vector types used for MULDQ/MULUDQ.
21635   MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21636
21637   // MULDQ returns the 64-bit result of the signed multiplication of the lower
21638   // 32-bits. We can lower with this if the sign bits stretch that far.
21639   if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21640       DAG.ComputeNumSignBits(B) > 32) {
21641     return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21642                        DAG.getBitcast(MulVT, B));
21643   }
21644
21645   //  Ahi = psrlqi(a, 32);
21646   //  Bhi = psrlqi(b, 32);
21647   //
21648   //  AloBlo = pmuludq(a, b);
21649   //  AloBhi = pmuludq(a, Bhi);
21650   //  AhiBlo = pmuludq(Ahi, b);
21651   //
21652   //  Hi = psllqi(AloBhi + AhiBlo, 32);
21653   //  return AloBlo + Hi;
21654   APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21655   bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21656   bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21657
21658   APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21659   bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21660   bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21661
21662   // Bit cast to 32-bit vectors for MULUDQ.
21663   SDValue Alo = DAG.getBitcast(MulVT, A);
21664   SDValue Blo = DAG.getBitcast(MulVT, B);
21665
21666   SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21667
21668   // Only multiply lo/hi halves that aren't known to be zero.
21669   SDValue AloBlo = Zero;
21670   if (!ALoIsZero && !BLoIsZero)
21671     AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21672
21673   SDValue AloBhi = Zero;
21674   if (!ALoIsZero && !BHiIsZero) {
21675     SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21676     Bhi = DAG.getBitcast(MulVT, Bhi);
21677     AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21678   }
21679
21680   SDValue AhiBlo = Zero;
21681   if (!AHiIsZero && !BLoIsZero) {
21682     SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21683     Ahi = DAG.getBitcast(MulVT, Ahi);
21684     AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21685   }
21686
21687   SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21688   Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21689
21690   return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21691 }
21692
21693 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21694                          SelectionDAG &DAG) {
21695   SDLoc dl(Op);
21696   MVT VT = Op.getSimpleValueType();
21697
21698   // Decompose 256-bit ops into smaller 128-bit ops.
21699   if (VT.is256BitVector() && !Subtarget.hasInt256())
21700     return Lower256IntArith(Op, DAG);
21701
21702   // Only i8 vectors should need custom lowering after this.
21703   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21704          "Unsupported vector type");
21705
21706   // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21707   // logical shift down the upper half and pack back to i8.
21708   SDValue A = Op.getOperand(0);
21709   SDValue B = Op.getOperand(1);
21710
21711   // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21712   // and then ashr/lshr the upper bits down to the lower bits before multiply.
21713   unsigned Opcode = Op.getOpcode();
21714   unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21715   unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21716
21717   // AVX2 implementations - extend xmm subvectors to ymm.
21718   if (Subtarget.hasInt256()) {
21719     SDValue Lo = DAG.getIntPtrConstant(0, dl);
21720     SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21721
21722     if (VT == MVT::v32i8) {
21723       SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21724       SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21725       SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21726       SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21727       ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21728       BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21729       AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21730       BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21731       Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21732                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21733                        DAG.getConstant(8, dl, MVT::v16i16));
21734       Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21735                        DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21736                        DAG.getConstant(8, dl, MVT::v16i16));
21737       // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21738       // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21739       const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
21740                             16, 17, 18, 19, 20, 21, 22, 23};
21741       const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21742                             24, 25, 26, 27, 28, 29, 30, 31};
21743       return DAG.getNode(X86ISD::PACKUS, dl, VT,
21744                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21745                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21746     }
21747
21748     SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21749     SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21750     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21751     SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21752                                DAG.getConstant(8, dl, MVT::v16i16));
21753     Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21754     Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21755     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21756   }
21757
21758   assert(VT == MVT::v16i8 &&
21759          "Pre-AVX2 support only supports v16i8 multiplication");
21760   MVT ExVT = MVT::v8i16;
21761
21762   // Extract the lo parts and zero/sign extend to i16.
21763   SDValue ALo, BLo;
21764   if (Subtarget.hasSSE41()) {
21765     ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21766     BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21767   } else {
21768     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21769                             -1, 4, -1, 5, -1, 6, -1, 7};
21770     ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21771     BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21772     ALo = DAG.getBitcast(ExVT, ALo);
21773     BLo = DAG.getBitcast(ExVT, BLo);
21774     ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21775     BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21776   }
21777
21778   // Extract the hi parts and zero/sign extend to i16.
21779   SDValue AHi, BHi;
21780   if (Subtarget.hasSSE41()) {
21781     const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
21782                             -1, -1, -1, -1, -1, -1, -1, -1};
21783     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21784     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21785     AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21786     BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21787   } else {
21788     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
21789                             -1, 12, -1, 13, -1, 14, -1, 15};
21790     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21791     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21792     AHi = DAG.getBitcast(ExVT, AHi);
21793     BHi = DAG.getBitcast(ExVT, BHi);
21794     AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21795     BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21796   }
21797
21798   // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21799   // pack back to v16i8.
21800   SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21801   SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21802   RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21803   RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21804   return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21805 }
21806
21807 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21808   assert(Subtarget.isTargetWin64() && "Unexpected target");
21809   EVT VT = Op.getValueType();
21810   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21811          "Unexpected return type for lowering");
21812
21813   RTLIB::Libcall LC;
21814   bool isSigned;
21815   switch (Op->getOpcode()) {
21816   default: llvm_unreachable("Unexpected request for libcall!");
21817   case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
21818   case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
21819   case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
21820   case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
21821   case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
21822   case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21823   }
21824
21825   SDLoc dl(Op);
21826   SDValue InChain = DAG.getEntryNode();
21827
21828   TargetLowering::ArgListTy Args;
21829   TargetLowering::ArgListEntry Entry;
21830   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21831     EVT ArgVT = Op->getOperand(i).getValueType();
21832     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21833            "Unexpected argument type for lowering");
21834     SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21835     Entry.Node = StackPtr;
21836     InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21837                            MachinePointerInfo(), /* Alignment = */ 16);
21838     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21839     Entry.Ty = PointerType::get(ArgTy,0);
21840     Entry.IsSExt = false;
21841     Entry.IsZExt = false;
21842     Args.push_back(Entry);
21843   }
21844
21845   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21846                                          getPointerTy(DAG.getDataLayout()));
21847
21848   TargetLowering::CallLoweringInfo CLI(DAG);
21849   CLI.setDebugLoc(dl)
21850       .setChain(InChain)
21851       .setLibCallee(
21852           getLibcallCallingConv(LC),
21853           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21854           std::move(Args))
21855       .setInRegister()
21856       .setSExtResult(isSigned)
21857       .setZExtResult(!isSigned);
21858
21859   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21860   return DAG.getBitcast(VT, CallInfo.first);
21861 }
21862
21863 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21864                              SelectionDAG &DAG) {
21865   SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21866   MVT VT = Op0.getSimpleValueType();
21867   SDLoc dl(Op);
21868
21869   // Decompose 256-bit ops into smaller 128-bit ops.
21870   if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21871     unsigned Opcode = Op.getOpcode();
21872     unsigned NumElems = VT.getVectorNumElements();
21873     MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21874     SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21875     SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21876     SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21877     SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21878     SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21879     SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21880     SDValue Ops[] = {
21881       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21882       DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21883     };
21884     return DAG.getMergeValues(Ops, dl);
21885   }
21886
21887   assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21888          (VT == MVT::v8i32 && Subtarget.hasInt256()));
21889
21890   // PMULxD operations multiply each even value (starting at 0) of LHS with
21891   // the related value of RHS and produce a widen result.
21892   // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21893   // => <2 x i64> <ae|cg>
21894   //
21895   // In other word, to have all the results, we need to perform two PMULxD:
21896   // 1. one with the even values.
21897   // 2. one with the odd values.
21898   // To achieve #2, with need to place the odd values at an even position.
21899   //
21900   // Place the odd value at an even position (basically, shift all values 1
21901   // step to the left):
21902   const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21903   // <a|b|c|d> => <b|undef|d|undef>
21904   SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21905                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21906   // <e|f|g|h> => <f|undef|h|undef>
21907   SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21908                              makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21909
21910   // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21911   // ints.
21912   MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21913   bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21914   unsigned Opcode =
21915       (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21916   // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21917   // => <2 x i64> <ae|cg>
21918   SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21919   // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21920   // => <2 x i64> <bf|dh>
21921   SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21922
21923   // Shuffle it back into the right order.
21924   SDValue Highs, Lows;
21925   if (VT == MVT::v8i32) {
21926     const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21927     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21928     const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21929     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21930   } else {
21931     const int HighMask[] = {1, 5, 3, 7};
21932     Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21933     const int LowMask[] = {0, 4, 2, 6};
21934     Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21935   }
21936
21937   // If we have a signed multiply but no PMULDQ fix up the high parts of a
21938   // unsigned multiply.
21939   if (IsSigned && !Subtarget.hasSSE41()) {
21940     SDValue ShAmt = DAG.getConstant(
21941         31, dl,
21942         DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21943     SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21944                              DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21945     SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21946                              DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21947
21948     SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21949     Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21950   }
21951
21952   // The first result of MUL_LOHI is actually the low value, followed by the
21953   // high value.
21954   SDValue Ops[] = {Lows, Highs};
21955   return DAG.getMergeValues(Ops, dl);
21956 }
21957
21958 // Return true if the required (according to Opcode) shift-imm form is natively
21959 // supported by the Subtarget
21960 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21961                                         unsigned Opcode) {
21962   if (VT.getScalarSizeInBits() < 16)
21963     return false;
21964
21965   if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21966       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21967     return true;
21968
21969   bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21970                 (VT.is256BitVector() && Subtarget.hasInt256());
21971
21972   bool AShift = LShift && (Subtarget.hasAVX512() ||
21973                            (VT != MVT::v2i64 && VT != MVT::v4i64));
21974   return (Opcode == ISD::SRA) ? AShift : LShift;
21975 }
21976
21977 // The shift amount is a variable, but it is the same for all vector lanes.
21978 // These instructions are defined together with shift-immediate.
21979 static
21980 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21981                                       unsigned Opcode) {
21982   return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21983 }
21984
21985 // Return true if the required (according to Opcode) variable-shift form is
21986 // natively supported by the Subtarget
21987 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21988                                     unsigned Opcode) {
21989
21990   if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21991     return false;
21992
21993   // vXi16 supported only on AVX-512, BWI
21994   if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21995     return false;
21996
21997   if (Subtarget.hasAVX512())
21998     return true;
21999
22000   bool LShift = VT.is128BitVector() || VT.is256BitVector();
22001   bool AShift = LShift &&  VT != MVT::v2i64 && VT != MVT::v4i64;
22002   return (Opcode == ISD::SRA) ? AShift : LShift;
22003 }
22004
22005 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22006                                          const X86Subtarget &Subtarget) {
22007   MVT VT = Op.getSimpleValueType();
22008   SDLoc dl(Op);
22009   SDValue R = Op.getOperand(0);
22010   SDValue Amt = Op.getOperand(1);
22011
22012   unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22013     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22014
22015   auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22016     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22017     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22018     SDValue Ex = DAG.getBitcast(ExVT, R);
22019
22020     // ashr(R, 63) === cmp_slt(R, 0)
22021     if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22022       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22023              "Unsupported PCMPGT op");
22024       return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22025                          getZeroVector(VT, Subtarget, DAG, dl), R);
22026     }
22027
22028     if (ShiftAmt >= 32) {
22029       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22030       SDValue Upper =
22031           getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22032       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22033                                                  ShiftAmt - 32, DAG);
22034       if (VT == MVT::v2i64)
22035         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22036       if (VT == MVT::v4i64)
22037         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22038                                   {9, 1, 11, 3, 13, 5, 15, 7});
22039     } else {
22040       // SRA upper i32, SHL whole i64 and select lower i32.
22041       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22042                                                  ShiftAmt, DAG);
22043       SDValue Lower =
22044           getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22045       Lower = DAG.getBitcast(ExVT, Lower);
22046       if (VT == MVT::v2i64)
22047         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22048       if (VT == MVT::v4i64)
22049         Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22050                                   {8, 1, 10, 3, 12, 5, 14, 7});
22051     }
22052     return DAG.getBitcast(VT, Ex);
22053   };
22054
22055   // Optimize shl/srl/sra with constant shift amount.
22056   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22057     if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22058       uint64_t ShiftAmt = ShiftConst->getZExtValue();
22059
22060       if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22061         return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22062
22063       // i64 SRA needs to be performed as partial shifts.
22064       if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22065            (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22066           Op.getOpcode() == ISD::SRA)
22067         return ArithmeticShiftRight64(ShiftAmt);
22068
22069       if (VT == MVT::v16i8 ||
22070           (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22071           VT == MVT::v64i8) {
22072         unsigned NumElts = VT.getVectorNumElements();
22073         MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22074
22075         // Simple i8 add case
22076         if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22077           return DAG.getNode(ISD::ADD, dl, VT, R, R);
22078
22079         // ashr(R, 7)  === cmp_slt(R, 0)
22080         if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22081           SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22082           if (VT.is512BitVector()) {
22083             assert(VT == MVT::v64i8 && "Unexpected element type!");
22084             SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22085             return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22086           }
22087           return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22088         }
22089
22090         // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22091         if (VT == MVT::v16i8 && Subtarget.hasXOP())
22092           return SDValue();
22093
22094         if (Op.getOpcode() == ISD::SHL) {
22095           // Make a large shift.
22096           SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22097                                                    R, ShiftAmt, DAG);
22098           SHL = DAG.getBitcast(VT, SHL);
22099           // Zero out the rightmost bits.
22100           return DAG.getNode(ISD::AND, dl, VT, SHL,
22101                              DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22102         }
22103         if (Op.getOpcode() == ISD::SRL) {
22104           // Make a large shift.
22105           SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22106                                                    R, ShiftAmt, DAG);
22107           SRL = DAG.getBitcast(VT, SRL);
22108           // Zero out the leftmost bits.
22109           return DAG.getNode(ISD::AND, dl, VT, SRL,
22110                              DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22111         }
22112         if (Op.getOpcode() == ISD::SRA) {
22113           // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
22114           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22115
22116           SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22117           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22118           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22119           return Res;
22120         }
22121         llvm_unreachable("Unknown shift opcode.");
22122       }
22123     }
22124   }
22125
22126   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22127   // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22128   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
22129       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22130        (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22131
22132     // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
22133     unsigned SubVectorScale = 1;
22134     if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22135       SubVectorScale =
22136           Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22137       Amt = Amt.getOperand(0);
22138     }
22139
22140     // Peek through any splat that was introduced for i64 shift vectorization.
22141     int SplatIndex = -1;
22142     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22143       if (SVN->isSplat()) {
22144         SplatIndex = SVN->getSplatIndex();
22145         Amt = Amt.getOperand(0);
22146         assert(SplatIndex < (int)VT.getVectorNumElements() &&
22147                "Splat shuffle referencing second operand");
22148       }
22149
22150     if (Amt.getOpcode() != ISD::BITCAST ||
22151         Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22152       return SDValue();
22153
22154     Amt = Amt.getOperand(0);
22155     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22156                      (SubVectorScale * VT.getVectorNumElements());
22157     unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22158     uint64_t ShiftAmt = 0;
22159     unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22160     for (unsigned i = 0; i != Ratio; ++i) {
22161       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22162       if (!C)
22163         return SDValue();
22164       // 6 == Log2(64)
22165       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22166     }
22167
22168     // Check remaining shift amounts (if not a splat).
22169     if (SplatIndex < 0) {
22170       for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22171         uint64_t ShAmt = 0;
22172         for (unsigned j = 0; j != Ratio; ++j) {
22173           ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22174           if (!C)
22175             return SDValue();
22176           // 6 == Log2(64)
22177           ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22178         }
22179         if (ShAmt != ShiftAmt)
22180           return SDValue();
22181       }
22182     }
22183
22184     if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22185       return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22186
22187     if (Op.getOpcode() == ISD::SRA)
22188       return ArithmeticShiftRight64(ShiftAmt);
22189   }
22190
22191   return SDValue();
22192 }
22193
22194 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22195                                         const X86Subtarget &Subtarget) {
22196   MVT VT = Op.getSimpleValueType();
22197   SDLoc dl(Op);
22198   SDValue R = Op.getOperand(0);
22199   SDValue Amt = Op.getOperand(1);
22200
22201   unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22202     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22203
22204   unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22205     (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22206
22207   if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22208     SDValue BaseShAmt;
22209     MVT EltVT = VT.getVectorElementType();
22210
22211     if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22212       // Check if this build_vector node is doing a splat.
22213       // If so, then set BaseShAmt equal to the splat value.
22214       BaseShAmt = BV->getSplatValue();
22215       if (BaseShAmt && BaseShAmt.isUndef())
22216         BaseShAmt = SDValue();
22217     } else {
22218       if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22219         Amt = Amt.getOperand(0);
22220
22221       ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22222       if (SVN && SVN->isSplat()) {
22223         unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22224         SDValue InVec = Amt.getOperand(0);
22225         if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22226           assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22227                  "Unexpected shuffle index found!");
22228           BaseShAmt = InVec.getOperand(SplatIdx);
22229         } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22230            if (ConstantSDNode *C =
22231                dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22232              if (C->getZExtValue() == SplatIdx)
22233                BaseShAmt = InVec.getOperand(1);
22234            }
22235         }
22236
22237         if (!BaseShAmt)
22238           // Avoid introducing an extract element from a shuffle.
22239           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22240                                   DAG.getIntPtrConstant(SplatIdx, dl));
22241       }
22242     }
22243
22244     if (BaseShAmt.getNode()) {
22245       assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22246       if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22247         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22248       else if (EltVT.bitsLT(MVT::i32))
22249         BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22250
22251       return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22252     }
22253   }
22254
22255   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22256   if (!Subtarget.is64Bit() && VT == MVT::v2i64  &&
22257       Amt.getOpcode() == ISD::BITCAST &&
22258       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22259     Amt = Amt.getOperand(0);
22260     unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22261                      VT.getVectorNumElements();
22262     std::vector<SDValue> Vals(Ratio);
22263     for (unsigned i = 0; i != Ratio; ++i)
22264       Vals[i] = Amt.getOperand(i);
22265     for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22266       for (unsigned j = 0; j != Ratio; ++j)
22267         if (Vals[j] != Amt.getOperand(i + j))
22268           return SDValue();
22269     }
22270
22271     if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22272       return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22273   }
22274   return SDValue();
22275 }
22276
22277 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22278                           SelectionDAG &DAG) {
22279   MVT VT = Op.getSimpleValueType();
22280   SDLoc dl(Op);
22281   SDValue R = Op.getOperand(0);
22282   SDValue Amt = Op.getOperand(1);
22283   bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22284
22285   assert(VT.isVector() && "Custom lowering only for vector shifts!");
22286   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22287
22288   if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22289     return V;
22290
22291   if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22292     return V;
22293
22294   if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22295     return Op;
22296
22297   // XOP has 128-bit variable logical/arithmetic shifts.
22298   // +ve/-ve Amt = shift left/right.
22299   if (Subtarget.hasXOP() &&
22300       (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22301        VT == MVT::v8i16 || VT == MVT::v16i8)) {
22302     if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22303       SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22304       Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22305     }
22306     if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22307       return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22308     if (Op.getOpcode() == ISD::SRA)
22309       return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22310   }
22311
22312   // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22313   // shifts per-lane and then shuffle the partial results back together.
22314   if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22315     // Splat the shift amounts so the scalar shifts above will catch it.
22316     SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22317     SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22318     SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22319     SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22320     return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22321   }
22322
22323   // i64 vector arithmetic shift can be emulated with the transform:
22324   // M = lshr(SIGN_MASK, Amt)
22325   // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22326   if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22327       Op.getOpcode() == ISD::SRA) {
22328     SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22329     SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22330     R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22331     R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22332     R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22333     return R;
22334   }
22335
22336   // If possible, lower this packed shift into a vector multiply instead of
22337   // expanding it into a sequence of scalar shifts.
22338   // Do this only if the vector shift count is a constant build_vector.
22339   if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22340       (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22341        (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22342     SmallVector<SDValue, 8> Elts;
22343     MVT SVT = VT.getVectorElementType();
22344     unsigned SVTBits = SVT.getSizeInBits();
22345     APInt One(SVTBits, 1);
22346     unsigned NumElems = VT.getVectorNumElements();
22347
22348     for (unsigned i=0; i !=NumElems; ++i) {
22349       SDValue Op = Amt->getOperand(i);
22350       if (Op->isUndef()) {
22351         Elts.push_back(Op);
22352         continue;
22353       }
22354
22355       ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22356       APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22357       uint64_t ShAmt = C.getZExtValue();
22358       if (ShAmt >= SVTBits) {
22359         Elts.push_back(DAG.getUNDEF(SVT));
22360         continue;
22361       }
22362       Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22363     }
22364     SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22365     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22366   }
22367
22368   // Lower SHL with variable shift amount.
22369   if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22370     Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22371
22372     Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22373                      DAG.getConstant(0x3f800000U, dl, VT));
22374     Op = DAG.getBitcast(MVT::v4f32, Op);
22375     Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22376     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22377   }
22378
22379   // If possible, lower this shift as a sequence of two shifts by
22380   // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22381   // Example:
22382   //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22383   //
22384   // Could be rewritten as:
22385   //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22386   //
22387   // The advantage is that the two shifts from the example would be
22388   // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22389   // the vector shift into four scalar shifts plus four pairs of vector
22390   // insert/extract.
22391   if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22392     unsigned TargetOpcode = X86ISD::MOVSS;
22393     bool CanBeSimplified;
22394     // The splat value for the first packed shift (the 'X' from the example).
22395     SDValue Amt1 = Amt->getOperand(0);
22396     // The splat value for the second packed shift (the 'Y' from the example).
22397     SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22398
22399     // See if it is possible to replace this node with a sequence of
22400     // two shifts followed by a MOVSS/MOVSD/PBLEND.
22401     if (VT == MVT::v4i32) {
22402       // Check if it is legal to use a MOVSS.
22403       CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22404                         Amt2 == Amt->getOperand(3);
22405       if (!CanBeSimplified) {
22406         // Otherwise, check if we can still simplify this node using a MOVSD.
22407         CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22408                           Amt->getOperand(2) == Amt->getOperand(3);
22409         TargetOpcode = X86ISD::MOVSD;
22410         Amt2 = Amt->getOperand(2);
22411       }
22412     } else {
22413       // Do similar checks for the case where the machine value type
22414       // is MVT::v8i16.
22415       CanBeSimplified = Amt1 == Amt->getOperand(1);
22416       for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22417         CanBeSimplified = Amt2 == Amt->getOperand(i);
22418
22419       if (!CanBeSimplified) {
22420         TargetOpcode = X86ISD::MOVSD;
22421         CanBeSimplified = true;
22422         Amt2 = Amt->getOperand(4);
22423         for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22424           CanBeSimplified = Amt1 == Amt->getOperand(i);
22425         for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22426           CanBeSimplified = Amt2 == Amt->getOperand(j);
22427       }
22428     }
22429
22430     if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22431         isa<ConstantSDNode>(Amt2)) {
22432       // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22433       MVT CastVT = MVT::v4i32;
22434       SDValue Splat1 =
22435           DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22436       SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22437       SDValue Splat2 =
22438           DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22439       SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22440       SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22441       SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22442       if (TargetOpcode == X86ISD::MOVSD)
22443         return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22444                                                        BitCast2, {0, 1, 6, 7}));
22445       return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22446                                                      BitCast2, {0, 5, 6, 7}));
22447     }
22448   }
22449
22450   // v4i32 Non Uniform Shifts.
22451   // If the shift amount is constant we can shift each lane using the SSE2
22452   // immediate shifts, else we need to zero-extend each lane to the lower i64
22453   // and shift using the SSE2 variable shifts.
22454   // The separate results can then be blended together.
22455   if (VT == MVT::v4i32) {
22456     unsigned Opc = Op.getOpcode();
22457     SDValue Amt0, Amt1, Amt2, Amt3;
22458     if (ConstantAmt) {
22459       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22460       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22461       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22462       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22463     } else {
22464       // ISD::SHL is handled above but we include it here for completeness.
22465       switch (Opc) {
22466       default:
22467         llvm_unreachable("Unknown target vector shift node");
22468       case ISD::SHL:
22469         Opc = X86ISD::VSHL;
22470         break;
22471       case ISD::SRL:
22472         Opc = X86ISD::VSRL;
22473         break;
22474       case ISD::SRA:
22475         Opc = X86ISD::VSRA;
22476         break;
22477       }
22478       // The SSE2 shifts use the lower i64 as the same shift amount for
22479       // all lanes and the upper i64 is ignored. These shuffle masks
22480       // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
22481       SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22482       Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22483       Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22484       Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22485       Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22486     }
22487
22488     SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22489     SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22490     SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22491     SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22492     SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22493     SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22494     return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22495   }
22496
22497   // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22498   // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22499   // make the existing SSE solution better.
22500   if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22501       (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22502       (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22503       (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22504     MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22505     MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22506     unsigned ExtOpc =
22507         Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22508     R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22509     Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22510     return DAG.getNode(ISD::TRUNCATE, dl, VT,
22511                        DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22512   }
22513
22514   if (VT == MVT::v16i8 ||
22515       (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22516       (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22517     MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22518     unsigned ShiftOpcode = Op->getOpcode();
22519
22520     auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22521       if (VT.is512BitVector()) {
22522         // On AVX512BW targets we make use of the fact that VSELECT lowers
22523         // to a masked blend which selects bytes based just on the sign bit
22524         // extracted to a mask.
22525         MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22526         V0 = DAG.getBitcast(VT, V0);
22527         V1 = DAG.getBitcast(VT, V1);
22528         Sel = DAG.getBitcast(VT, Sel);
22529         Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22530         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22531       } else if (Subtarget.hasSSE41()) {
22532         // On SSE41 targets we make use of the fact that VSELECT lowers
22533         // to PBLENDVB which selects bytes based just on the sign bit.
22534         V0 = DAG.getBitcast(VT, V0);
22535         V1 = DAG.getBitcast(VT, V1);
22536         Sel = DAG.getBitcast(VT, Sel);
22537         return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22538       }
22539       // On pre-SSE41 targets we test for the sign bit by comparing to
22540       // zero - a negative value will set all bits of the lanes to true
22541       // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22542       SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22543       SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22544       return DAG.getSelect(dl, SelVT, C, V0, V1);
22545     };
22546
22547     // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22548     // We can safely do this using i16 shifts as we're only interested in
22549     // the 3 lower bits of each byte.
22550     Amt = DAG.getBitcast(ExtVT, Amt);
22551     Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22552     Amt = DAG.getBitcast(VT, Amt);
22553
22554     if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22555       // r = VSELECT(r, shift(r, 4), a);
22556       SDValue M =
22557           DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22558       R = SignBitSelect(VT, Amt, M, R);
22559
22560       // a += a
22561       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22562
22563       // r = VSELECT(r, shift(r, 2), a);
22564       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22565       R = SignBitSelect(VT, Amt, M, R);
22566
22567       // a += a
22568       Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22569
22570       // return VSELECT(r, shift(r, 1), a);
22571       M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22572       R = SignBitSelect(VT, Amt, M, R);
22573       return R;
22574     }
22575
22576     if (Op->getOpcode() == ISD::SRA) {
22577       // For SRA we need to unpack each byte to the higher byte of a i16 vector
22578       // so we can correctly sign extend. We don't care what happens to the
22579       // lower byte.
22580       SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22581       SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22582       SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22583       SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22584       ALo = DAG.getBitcast(ExtVT, ALo);
22585       AHi = DAG.getBitcast(ExtVT, AHi);
22586       RLo = DAG.getBitcast(ExtVT, RLo);
22587       RHi = DAG.getBitcast(ExtVT, RHi);
22588
22589       // r = VSELECT(r, shift(r, 4), a);
22590       SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22591                                 DAG.getConstant(4, dl, ExtVT));
22592       SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22593                                 DAG.getConstant(4, dl, ExtVT));
22594       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22595       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22596
22597       // a += a
22598       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22599       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22600
22601       // r = VSELECT(r, shift(r, 2), a);
22602       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22603                         DAG.getConstant(2, dl, ExtVT));
22604       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22605                         DAG.getConstant(2, dl, ExtVT));
22606       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22607       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22608
22609       // a += a
22610       ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22611       AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22612
22613       // r = VSELECT(r, shift(r, 1), a);
22614       MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22615                         DAG.getConstant(1, dl, ExtVT));
22616       MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22617                         DAG.getConstant(1, dl, ExtVT));
22618       RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22619       RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22620
22621       // Logical shift the result back to the lower byte, leaving a zero upper
22622       // byte
22623       // meaning that we can safely pack with PACKUSWB.
22624       RLo =
22625           DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22626       RHi =
22627           DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22628       return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22629     }
22630   }
22631
22632   if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22633     MVT ExtVT = MVT::v8i32;
22634     SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22635     SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22636     SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22637     SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22638     SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22639     ALo = DAG.getBitcast(ExtVT, ALo);
22640     AHi = DAG.getBitcast(ExtVT, AHi);
22641     RLo = DAG.getBitcast(ExtVT, RLo);
22642     RHi = DAG.getBitcast(ExtVT, RHi);
22643     SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22644     SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22645     Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22646     Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22647     return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22648   }
22649
22650   if (VT == MVT::v8i16) {
22651     unsigned ShiftOpcode = Op->getOpcode();
22652
22653     // If we have a constant shift amount, the non-SSE41 path is best as
22654     // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22655     bool UseSSE41 = Subtarget.hasSSE41() &&
22656                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22657
22658     auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22659       // On SSE41 targets we make use of the fact that VSELECT lowers
22660       // to PBLENDVB which selects bytes based just on the sign bit.
22661       if (UseSSE41) {
22662         MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22663         V0 = DAG.getBitcast(ExtVT, V0);
22664         V1 = DAG.getBitcast(ExtVT, V1);
22665         Sel = DAG.getBitcast(ExtVT, Sel);
22666         return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22667       }
22668       // On pre-SSE41 targets we splat the sign bit - a negative value will
22669       // set all bits of the lanes to true and VSELECT uses that in
22670       // its OR(AND(V0,C),AND(V1,~C)) lowering.
22671       SDValue C =
22672           DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22673       return DAG.getSelect(dl, VT, C, V0, V1);
22674     };
22675
22676     // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22677     if (UseSSE41) {
22678       // On SSE41 targets we need to replicate the shift mask in both
22679       // bytes for PBLENDVB.
22680       Amt = DAG.getNode(
22681           ISD::OR, dl, VT,
22682           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22683           DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22684     } else {
22685       Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22686     }
22687
22688     // r = VSELECT(r, shift(r, 8), a);
22689     SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22690     R = SignBitSelect(Amt, M, R);
22691
22692     // a += a
22693     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22694
22695     // r = VSELECT(r, shift(r, 4), a);
22696     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22697     R = SignBitSelect(Amt, M, R);
22698
22699     // a += a
22700     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22701
22702     // r = VSELECT(r, shift(r, 2), a);
22703     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22704     R = SignBitSelect(Amt, M, R);
22705
22706     // a += a
22707     Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22708
22709     // return VSELECT(r, shift(r, 1), a);
22710     M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22711     R = SignBitSelect(Amt, M, R);
22712     return R;
22713   }
22714
22715   // Decompose 256-bit shifts into smaller 128-bit shifts.
22716   if (VT.is256BitVector())
22717     return Lower256IntArith(Op, DAG);
22718
22719   return SDValue();
22720 }
22721
22722 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22723                            SelectionDAG &DAG) {
22724   MVT VT = Op.getSimpleValueType();
22725   SDLoc DL(Op);
22726   SDValue R = Op.getOperand(0);
22727   SDValue Amt = Op.getOperand(1);
22728   unsigned Opcode = Op.getOpcode();
22729   unsigned EltSizeInBits = VT.getScalarSizeInBits();
22730
22731   if (Subtarget.hasAVX512()) {
22732     // Attempt to rotate by immediate.
22733     APInt UndefElts;
22734     SmallVector<APInt, 16> EltBits;
22735     if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
22736       if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
22737             return EltBits[0] == V;
22738           })) {
22739         unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
22740         uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
22741         return DAG.getNode(Op, DL, VT, R,
22742                            DAG.getConstant(RotateAmt, DL, MVT::i8));
22743       }
22744     }
22745
22746     // Else, fall-back on VPROLV/VPRORV.
22747     return Op;
22748   }
22749
22750   assert(VT.isVector() && "Custom lowering only for vector rotates!");
22751   assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22752   assert((Opcode == ISD::ROTL) && "Only ROTL supported");
22753
22754   // XOP has 128-bit vector variable + immediate rotates.
22755   // +ve/-ve Amt = rotate left/right.
22756
22757   // Split 256-bit integers.
22758   if (VT.is256BitVector())
22759     return Lower256IntArith(Op, DAG);
22760
22761   assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22762
22763   // Attempt to rotate by immediate.
22764   if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22765     if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22766       uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22767       assert(RotateAmt < EltSizeInBits && "Rotation out of range");
22768       return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22769                          DAG.getConstant(RotateAmt, DL, MVT::i8));
22770     }
22771   }
22772
22773   // Use general rotate by variable (per-element).
22774   return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22775 }
22776
22777 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22778   // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22779   // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22780   // looks for this combo and may remove the "setcc" instruction if the "setcc"
22781   // has only one use.
22782   SDNode *N = Op.getNode();
22783   SDValue LHS = N->getOperand(0);
22784   SDValue RHS = N->getOperand(1);
22785   unsigned BaseOp = 0;
22786   X86::CondCode Cond;
22787   SDLoc DL(Op);
22788   switch (Op.getOpcode()) {
22789   default: llvm_unreachable("Unknown ovf instruction!");
22790   case ISD::SADDO:
22791     // A subtract of one will be selected as a INC. Note that INC doesn't
22792     // set CF, so we can't do this for UADDO.
22793     if (isOneConstant(RHS)) {
22794       BaseOp = X86ISD::INC;
22795       Cond = X86::COND_O;
22796       break;
22797     }
22798     BaseOp = X86ISD::ADD;
22799     Cond = X86::COND_O;
22800     break;
22801   case ISD::UADDO:
22802     BaseOp = X86ISD::ADD;
22803     Cond = X86::COND_B;
22804     break;
22805   case ISD::SSUBO:
22806     // A subtract of one will be selected as a DEC. Note that DEC doesn't
22807     // set CF, so we can't do this for USUBO.
22808     if (isOneConstant(RHS)) {
22809       BaseOp = X86ISD::DEC;
22810       Cond = X86::COND_O;
22811       break;
22812     }
22813     BaseOp = X86ISD::SUB;
22814     Cond = X86::COND_O;
22815     break;
22816   case ISD::USUBO:
22817     BaseOp = X86ISD::SUB;
22818     Cond = X86::COND_B;
22819     break;
22820   case ISD::SMULO:
22821     BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22822     Cond = X86::COND_O;
22823     break;
22824   case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22825     if (N->getValueType(0) == MVT::i8) {
22826       BaseOp = X86ISD::UMUL8;
22827       Cond = X86::COND_O;
22828       break;
22829     }
22830     SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22831                                  MVT::i32);
22832     SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22833
22834     SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22835
22836     if (N->getValueType(1) == MVT::i1)
22837       SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22838
22839     return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22840   }
22841   }
22842
22843   // Also sets EFLAGS.
22844   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22845   SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22846
22847   SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22848
22849   if (N->getValueType(1) == MVT::i1)
22850     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22851
22852   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22853 }
22854
22855 /// Returns true if the operand type is exactly twice the native width, and
22856 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22857 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22858 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22859 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22860   unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22861
22862   if (OpWidth == 64)
22863     return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22864   else if (OpWidth == 128)
22865     return Subtarget.hasCmpxchg16b();
22866   else
22867     return false;
22868 }
22869
22870 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22871   return needsCmpXchgNb(SI->getValueOperand()->getType());
22872 }
22873
22874 // Note: this turns large loads into lock cmpxchg8b/16b.
22875 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22876 TargetLowering::AtomicExpansionKind
22877 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22878   auto PTy = cast<PointerType>(LI->getPointerOperandType());
22879   return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22880                                                : AtomicExpansionKind::None;
22881 }
22882
22883 TargetLowering::AtomicExpansionKind
22884 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22885   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22886   Type *MemType = AI->getType();
22887
22888   // If the operand is too big, we must see if cmpxchg8/16b is available
22889   // and default to library calls otherwise.
22890   if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22891     return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22892                                    : AtomicExpansionKind::None;
22893   }
22894
22895   AtomicRMWInst::BinOp Op = AI->getOperation();
22896   switch (Op) {
22897   default:
22898     llvm_unreachable("Unknown atomic operation");
22899   case AtomicRMWInst::Xchg:
22900   case AtomicRMWInst::Add:
22901   case AtomicRMWInst::Sub:
22902     // It's better to use xadd, xsub or xchg for these in all cases.
22903     return AtomicExpansionKind::None;
22904   case AtomicRMWInst::Or:
22905   case AtomicRMWInst::And:
22906   case AtomicRMWInst::Xor:
22907     // If the atomicrmw's result isn't actually used, we can just add a "lock"
22908     // prefix to a normal instruction for these operations.
22909     return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22910                             : AtomicExpansionKind::None;
22911   case AtomicRMWInst::Nand:
22912   case AtomicRMWInst::Max:
22913   case AtomicRMWInst::Min:
22914   case AtomicRMWInst::UMax:
22915   case AtomicRMWInst::UMin:
22916     // These always require a non-trivial set of data operations on x86. We must
22917     // use a cmpxchg loop.
22918     return AtomicExpansionKind::CmpXChg;
22919   }
22920 }
22921
22922 LoadInst *
22923 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22924   unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22925   Type *MemType = AI->getType();
22926   // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22927   // there is no benefit in turning such RMWs into loads, and it is actually
22928   // harmful as it introduces a mfence.
22929   if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22930     return nullptr;
22931
22932   auto Builder = IRBuilder<>(AI);
22933   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22934   auto SSID = AI->getSyncScopeID();
22935   // We must restrict the ordering to avoid generating loads with Release or
22936   // ReleaseAcquire orderings.
22937   auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22938   auto Ptr = AI->getPointerOperand();
22939
22940   // Before the load we need a fence. Here is an example lifted from
22941   // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22942   // is required:
22943   // Thread 0:
22944   //   x.store(1, relaxed);
22945   //   r1 = y.fetch_add(0, release);
22946   // Thread 1:
22947   //   y.fetch_add(42, acquire);
22948   //   r2 = x.load(relaxed);
22949   // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22950   // lowered to just a load without a fence. A mfence flushes the store buffer,
22951   // making the optimization clearly correct.
22952   // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22953   // otherwise, we might be able to be more aggressive on relaxed idempotent
22954   // rmw. In practice, they do not look useful, so we don't try to be
22955   // especially clever.
22956   if (SSID == SyncScope::SingleThread)
22957     // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22958     // the IR level, so we must wrap it in an intrinsic.
22959     return nullptr;
22960
22961   if (!Subtarget.hasMFence())
22962     // FIXME: it might make sense to use a locked operation here but on a
22963     // different cache-line to prevent cache-line bouncing. In practice it
22964     // is probably a small win, and x86 processors without mfence are rare
22965     // enough that we do not bother.
22966     return nullptr;
22967
22968   Function *MFence =
22969       llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22970   Builder.CreateCall(MFence, {});
22971
22972   // Finally we can emit the atomic load.
22973   LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22974           AI->getType()->getPrimitiveSizeInBits());
22975   Loaded->setAtomic(Order, SSID);
22976   AI->replaceAllUsesWith(Loaded);
22977   AI->eraseFromParent();
22978   return Loaded;
22979 }
22980
22981 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22982                                  SelectionDAG &DAG) {
22983   SDLoc dl(Op);
22984   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22985     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22986   SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
22987     cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22988
22989   // The only fence that needs an instruction is a sequentially-consistent
22990   // cross-thread fence.
22991   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22992       FenceSSID == SyncScope::System) {
22993     if (Subtarget.hasMFence())
22994       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22995
22996     SDValue Chain = Op.getOperand(0);
22997     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22998     SDValue Ops[] = {
22999       DAG.getRegister(X86::ESP, MVT::i32),     // Base
23000       DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
23001       DAG.getRegister(0, MVT::i32),            // Index
23002       DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
23003       DAG.getRegister(0, MVT::i32),            // Segment.
23004       Zero,
23005       Chain
23006     };
23007     SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23008     return SDValue(Res, 0);
23009   }
23010
23011   // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23012   return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23013 }
23014
23015 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23016                              SelectionDAG &DAG) {
23017   MVT T = Op.getSimpleValueType();
23018   SDLoc DL(Op);
23019   unsigned Reg = 0;
23020   unsigned size = 0;
23021   switch(T.SimpleTy) {
23022   default: llvm_unreachable("Invalid value type!");
23023   case MVT::i8:  Reg = X86::AL;  size = 1; break;
23024   case MVT::i16: Reg = X86::AX;  size = 2; break;
23025   case MVT::i32: Reg = X86::EAX; size = 4; break;
23026   case MVT::i64:
23027     assert(Subtarget.is64Bit() && "Node not type legal!");
23028     Reg = X86::RAX; size = 8;
23029     break;
23030   }
23031   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23032                                   Op.getOperand(2), SDValue());
23033   SDValue Ops[] = { cpIn.getValue(0),
23034                     Op.getOperand(1),
23035                     Op.getOperand(3),
23036                     DAG.getTargetConstant(size, DL, MVT::i8),
23037                     cpIn.getValue(1) };
23038   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23039   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23040   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23041                                            Ops, T, MMO);
23042
23043   SDValue cpOut =
23044     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23045   SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23046                                       MVT::i32, cpOut.getValue(2));
23047   SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23048
23049   DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23050   DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23051   DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23052   return SDValue();
23053 }
23054
23055 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23056                             SelectionDAG &DAG) {
23057   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23058   MVT DstVT = Op.getSimpleValueType();
23059
23060   if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23061       SrcVT == MVT::i64) {
23062     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23063     if (DstVT != MVT::f64)
23064       // This conversion needs to be expanded.
23065       return SDValue();
23066
23067     SDValue Op0 = Op->getOperand(0);
23068     SmallVector<SDValue, 16> Elts;
23069     SDLoc dl(Op);
23070     unsigned NumElts;
23071     MVT SVT;
23072     if (SrcVT.isVector()) {
23073       NumElts = SrcVT.getVectorNumElements();
23074       SVT = SrcVT.getVectorElementType();
23075
23076       // Widen the vector in input in the case of MVT::v2i32.
23077       // Example: from MVT::v2i32 to MVT::v4i32.
23078       for (unsigned i = 0, e = NumElts; i != e; ++i)
23079         Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23080                                    DAG.getIntPtrConstant(i, dl)));
23081     } else {
23082       assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23083              "Unexpected source type in LowerBITCAST");
23084       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23085                                  DAG.getIntPtrConstant(0, dl)));
23086       Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23087                                  DAG.getIntPtrConstant(1, dl)));
23088       NumElts = 2;
23089       SVT = MVT::i32;
23090     }
23091     // Explicitly mark the extra elements as Undef.
23092     Elts.append(NumElts, DAG.getUNDEF(SVT));
23093
23094     EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23095     SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23096     SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23097     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23098                        DAG.getIntPtrConstant(0, dl));
23099   }
23100
23101   assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23102          Subtarget.hasMMX() && "Unexpected custom BITCAST");
23103   assert((DstVT == MVT::i64 ||
23104           (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23105          "Unexpected custom BITCAST");
23106   // i64 <=> MMX conversions are Legal.
23107   if (SrcVT==MVT::i64 && DstVT.isVector())
23108     return Op;
23109   if (DstVT==MVT::i64 && SrcVT.isVector())
23110     return Op;
23111   // MMX <=> MMX conversions are Legal.
23112   if (SrcVT.isVector() && DstVT.isVector())
23113     return Op;
23114   // All other conversions need to be expanded.
23115   return SDValue();
23116 }
23117
23118 /// Compute the horizontal sum of bytes in V for the elements of VT.
23119 ///
23120 /// Requires V to be a byte vector and VT to be an integer vector type with
23121 /// wider elements than V's type. The width of the elements of VT determines
23122 /// how many bytes of V are summed horizontally to produce each element of the
23123 /// result.
23124 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23125                                       const X86Subtarget &Subtarget,
23126                                       SelectionDAG &DAG) {
23127   SDLoc DL(V);
23128   MVT ByteVecVT = V.getSimpleValueType();
23129   MVT EltVT = VT.getVectorElementType();
23130   assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23131          "Expected value to have byte element type.");
23132   assert(EltVT != MVT::i8 &&
23133          "Horizontal byte sum only makes sense for wider elements!");
23134   unsigned VecSize = VT.getSizeInBits();
23135   assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23136
23137   // PSADBW instruction horizontally add all bytes and leave the result in i64
23138   // chunks, thus directly computes the pop count for v2i64 and v4i64.
23139   if (EltVT == MVT::i64) {
23140     SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23141     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23142     V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23143     return DAG.getBitcast(VT, V);
23144   }
23145
23146   if (EltVT == MVT::i32) {
23147     // We unpack the low half and high half into i32s interleaved with zeros so
23148     // that we can use PSADBW to horizontally sum them. The most useful part of
23149     // this is that it lines up the results of two PSADBW instructions to be
23150     // two v2i64 vectors which concatenated are the 4 population counts. We can
23151     // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23152     SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23153     SDValue V32 = DAG.getBitcast(VT, V);
23154     SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23155     SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23156
23157     // Do the horizontal sums into two v2i64s.
23158     Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23159     MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23160     Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23161                       DAG.getBitcast(ByteVecVT, Low), Zeros);
23162     High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23163                        DAG.getBitcast(ByteVecVT, High), Zeros);
23164
23165     // Merge them together.
23166     MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23167     V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23168                     DAG.getBitcast(ShortVecVT, Low),
23169                     DAG.getBitcast(ShortVecVT, High));
23170
23171     return DAG.getBitcast(VT, V);
23172   }
23173
23174   // The only element type left is i16.
23175   assert(EltVT == MVT::i16 && "Unknown how to handle type");
23176
23177   // To obtain pop count for each i16 element starting from the pop count for
23178   // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23179   // right by 8. It is important to shift as i16s as i8 vector shift isn't
23180   // directly supported.
23181   SDValue ShifterV = DAG.getConstant(8, DL, VT);
23182   SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23183   V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23184                   DAG.getBitcast(ByteVecVT, V));
23185   return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23186 }
23187
23188 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23189                                         const X86Subtarget &Subtarget,
23190                                         SelectionDAG &DAG) {
23191   MVT VT = Op.getSimpleValueType();
23192   MVT EltVT = VT.getVectorElementType();
23193   unsigned VecSize = VT.getSizeInBits();
23194
23195   // Implement a lookup table in register by using an algorithm based on:
23196   // http://wm.ite.pl/articles/sse-popcount.html
23197   //
23198   // The general idea is that every lower byte nibble in the input vector is an
23199   // index into a in-register pre-computed pop count table. We then split up the
23200   // input vector in two new ones: (1) a vector with only the shifted-right
23201   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
23202   // masked out higher ones) for each byte. PSHUFB is used separately with both
23203   // to index the in-register table. Next, both are added and the result is a
23204   // i8 vector where each element contains the pop count for input byte.
23205   //
23206   // To obtain the pop count for elements != i8, we follow up with the same
23207   // approach and use additional tricks as described below.
23208   //
23209   const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23210                        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23211                        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23212                        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
23213
23214   int NumByteElts = VecSize / 8;
23215   MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23216   SDValue In = DAG.getBitcast(ByteVecVT, Op);
23217   SmallVector<SDValue, 64> LUTVec;
23218   for (int i = 0; i < NumByteElts; ++i)
23219     LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23220   SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23221   SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23222
23223   // High nibbles
23224   SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23225   SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23226
23227   // Low nibbles
23228   SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23229
23230   // The input vector is used as the shuffle mask that index elements into the
23231   // LUT. After counting low and high nibbles, add the vector to obtain the
23232   // final pop count per i8 element.
23233   SDValue HighPopCnt =
23234       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23235   SDValue LowPopCnt =
23236       DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23237   SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23238
23239   if (EltVT == MVT::i8)
23240     return PopCnt;
23241
23242   return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23243 }
23244
23245 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23246                                        const X86Subtarget &Subtarget,
23247                                        SelectionDAG &DAG) {
23248   MVT VT = Op.getSimpleValueType();
23249   assert(VT.is128BitVector() &&
23250          "Only 128-bit vector bitmath lowering supported.");
23251
23252   int VecSize = VT.getSizeInBits();
23253   MVT EltVT = VT.getVectorElementType();
23254   int Len = EltVT.getSizeInBits();
23255
23256   // This is the vectorized version of the "best" algorithm from
23257   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23258   // with a minor tweak to use a series of adds + shifts instead of vector
23259   // multiplications. Implemented for all integer vector types. We only use
23260   // this when we don't have SSSE3 which allows a LUT-based lowering that is
23261   // much faster, even faster than using native popcnt instructions.
23262
23263   auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23264     MVT VT = V.getSimpleValueType();
23265     SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23266     return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23267   };
23268   auto GetMask = [&](SDValue V, APInt Mask) {
23269     MVT VT = V.getSimpleValueType();
23270     SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23271     return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23272   };
23273
23274   // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23275   // x86, so set the SRL type to have elements at least i16 wide. This is
23276   // correct because all of our SRLs are followed immediately by a mask anyways
23277   // that handles any bits that sneak into the high bits of the byte elements.
23278   MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23279
23280   SDValue V = Op;
23281
23282   // v = v - ((v >> 1) & 0x55555555...)
23283   SDValue Srl =
23284       DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23285   SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23286   V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23287
23288   // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23289   SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23290   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23291   SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23292   V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23293
23294   // v = (v + (v >> 4)) & 0x0F0F0F0F...
23295   Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23296   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23297   V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23298
23299   // At this point, V contains the byte-wise population count, and we are
23300   // merely doing a horizontal sum if necessary to get the wider element
23301   // counts.
23302   if (EltVT == MVT::i8)
23303     return V;
23304
23305   return LowerHorizontalByteSum(
23306       DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23307       DAG);
23308 }
23309
23310 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23311 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23312 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23313                                 SelectionDAG &DAG) {
23314   MVT VT = Op.getSimpleValueType();
23315   assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23316          "Unknown CTPOP type to handle");
23317   SDLoc DL(Op.getNode());
23318   SDValue Op0 = Op.getOperand(0);
23319
23320   // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23321   if (Subtarget.hasVPOPCNTDQ()) {
23322     if (VT == MVT::v8i16) {
23323       Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23324       Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
23325       return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23326     }
23327     if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23328       Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23329       Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
23330       return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23331     }
23332   }
23333
23334   if (!Subtarget.hasSSSE3()) {
23335     // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23336     assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23337     return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23338   }
23339
23340   // Decompose 256-bit ops into smaller 128-bit ops.
23341   if (VT.is256BitVector() && !Subtarget.hasInt256())
23342     return Lower256IntUnary(Op, DAG);
23343
23344   // Decompose 512-bit ops into smaller 256-bit ops.
23345   if (VT.is512BitVector() && !Subtarget.hasBWI())
23346     return Lower512IntUnary(Op, DAG);
23347
23348   return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23349 }
23350
23351 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23352                           SelectionDAG &DAG) {
23353   assert(Op.getSimpleValueType().isVector() &&
23354          "We only do custom lowering for vector population count.");
23355   return LowerVectorCTPOP(Op, Subtarget, DAG);
23356 }
23357
23358 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23359   MVT VT = Op.getSimpleValueType();
23360   SDValue In = Op.getOperand(0);
23361   SDLoc DL(Op);
23362
23363   // For scalars, its still beneficial to transfer to/from the SIMD unit to
23364   // perform the BITREVERSE.
23365   if (!VT.isVector()) {
23366     MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23367     SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23368     Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23369     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23370                        DAG.getIntPtrConstant(0, DL));
23371   }
23372
23373   int NumElts = VT.getVectorNumElements();
23374   int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23375
23376   // Decompose 256-bit ops into smaller 128-bit ops.
23377   if (VT.is256BitVector())
23378     return Lower256IntUnary(Op, DAG);
23379
23380   assert(VT.is128BitVector() &&
23381          "Only 128-bit vector bitreverse lowering supported.");
23382
23383   // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23384   // perform the BSWAP in the shuffle.
23385   // Its best to shuffle using the second operand as this will implicitly allow
23386   // memory folding for multiple vectors.
23387   SmallVector<SDValue, 16> MaskElts;
23388   for (int i = 0; i != NumElts; ++i) {
23389     for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23390       int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23391       int PermuteByte = SourceByte | (2 << 5);
23392       MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23393     }
23394   }
23395
23396   SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23397   SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23398   Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23399                     Res, Mask);
23400   return DAG.getBitcast(VT, Res);
23401 }
23402
23403 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23404                                SelectionDAG &DAG) {
23405   if (Subtarget.hasXOP())
23406     return LowerBITREVERSE_XOP(Op, DAG);
23407
23408   assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23409
23410   MVT VT = Op.getSimpleValueType();
23411   SDValue In = Op.getOperand(0);
23412   SDLoc DL(Op);
23413
23414   unsigned NumElts = VT.getVectorNumElements();
23415   assert(VT.getScalarType() == MVT::i8 &&
23416          "Only byte vector BITREVERSE supported");
23417
23418   // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23419   if (VT.is256BitVector() && !Subtarget.hasInt256())
23420     return Lower256IntUnary(Op, DAG);
23421
23422   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23423   // two nibbles and a PSHUFB lookup to find the bitreverse of each
23424   // 0-15 value (moved to the other nibble).
23425   SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23426   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23427   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23428
23429   const int LoLUT[16] = {
23430       /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23431       /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23432       /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23433       /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23434   const int HiLUT[16] = {
23435       /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23436       /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23437       /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23438       /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
23439
23440   SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23441   for (unsigned i = 0; i < NumElts; ++i) {
23442     LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23443     HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23444   }
23445
23446   SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23447   SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23448   Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23449   Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23450   return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23451 }
23452
23453 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23454   unsigned NewOpc = 0;
23455   switch (N->getOpcode()) {
23456   case ISD::ATOMIC_LOAD_ADD:
23457     NewOpc = X86ISD::LADD;
23458     break;
23459   case ISD::ATOMIC_LOAD_SUB:
23460     NewOpc = X86ISD::LSUB;
23461     break;
23462   case ISD::ATOMIC_LOAD_OR:
23463     NewOpc = X86ISD::LOR;
23464     break;
23465   case ISD::ATOMIC_LOAD_XOR:
23466     NewOpc = X86ISD::LXOR;
23467     break;
23468   case ISD::ATOMIC_LOAD_AND:
23469     NewOpc = X86ISD::LAND;
23470     break;
23471   default:
23472     llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23473   }
23474
23475   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23476   return DAG.getMemIntrinsicNode(
23477       NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23478       {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23479       /*MemVT=*/N->getSimpleValueType(0), MMO);
23480 }
23481
23482 /// Lower atomic_load_ops into LOCK-prefixed operations.
23483 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23484                                 const X86Subtarget &Subtarget) {
23485   SDValue Chain = N->getOperand(0);
23486   SDValue LHS = N->getOperand(1);
23487   SDValue RHS = N->getOperand(2);
23488   unsigned Opc = N->getOpcode();
23489   MVT VT = N->getSimpleValueType(0);
23490   SDLoc DL(N);
23491
23492   // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23493   // can only be lowered when the result is unused.  They should have already
23494   // been transformed into a cmpxchg loop in AtomicExpand.
23495   if (N->hasAnyUseOfValue(0)) {
23496     // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23497     // select LXADD if LOCK_SUB can't be selected.
23498     if (Opc == ISD::ATOMIC_LOAD_SUB) {
23499       AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23500       RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23501       return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23502                            RHS, AN->getMemOperand());
23503     }
23504     assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23505            "Used AtomicRMW ops other than Add should have been expanded!");
23506     return N;
23507   }
23508
23509   SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23510   // RAUW the chain, but don't worry about the result, as it's unused.
23511   assert(!N->hasAnyUseOfValue(0));
23512   DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23513   return SDValue();
23514 }
23515
23516 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23517   SDNode *Node = Op.getNode();
23518   SDLoc dl(Node);
23519   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23520
23521   // Convert seq_cst store -> xchg
23522   // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23523   // FIXME: On 32-bit, store -> fist or movq would be more efficient
23524   //        (The only way to get a 16-byte store is cmpxchg16b)
23525   // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23526   if (cast<AtomicSDNode>(Node)->getOrdering() ==
23527           AtomicOrdering::SequentiallyConsistent ||
23528       !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23529     SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23530                                  cast<AtomicSDNode>(Node)->getMemoryVT(),
23531                                  Node->getOperand(0),
23532                                  Node->getOperand(1), Node->getOperand(2),
23533                                  cast<AtomicSDNode>(Node)->getMemOperand());
23534     return Swap.getValue(1);
23535   }
23536   // Other atomic stores have a simple pattern.
23537   return Op;
23538 }
23539
23540 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23541   SDNode *N = Op.getNode();
23542   MVT VT = N->getSimpleValueType(0);
23543
23544   // Let legalize expand this if it isn't a legal type yet.
23545   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23546     return SDValue();
23547
23548   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23549   SDLoc DL(N);
23550
23551   // Set the carry flag.
23552   SDValue Carry = Op.getOperand(2);
23553   EVT CarryVT = Carry.getValueType();
23554   APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23555   Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23556                       Carry, DAG.getConstant(NegOne, DL, CarryVT));
23557
23558   unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23559   SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23560                             Op.getOperand(1), Carry.getValue(1));
23561
23562   SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23563   if (N->getValueType(1) == MVT::i1)
23564     SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23565
23566   return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23567 }
23568
23569 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23570                             SelectionDAG &DAG) {
23571   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23572
23573   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23574   // which returns the values as { float, float } (in XMM0) or
23575   // { double, double } (which is returned in XMM0, XMM1).
23576   SDLoc dl(Op);
23577   SDValue Arg = Op.getOperand(0);
23578   EVT ArgVT = Arg.getValueType();
23579   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23580
23581   TargetLowering::ArgListTy Args;
23582   TargetLowering::ArgListEntry Entry;
23583
23584   Entry.Node = Arg;
23585   Entry.Ty = ArgTy;
23586   Entry.IsSExt = false;
23587   Entry.IsZExt = false;
23588   Args.push_back(Entry);
23589
23590   bool isF64 = ArgVT == MVT::f64;
23591   // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23592   // the small struct {f32, f32} is returned in (eax, edx). For f64,
23593   // the results are returned via SRet in memory.
23594   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
23595   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23596   SDValue Callee =
23597       DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23598
23599   Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23600                       : (Type *)VectorType::get(ArgTy, 4);
23601
23602   TargetLowering::CallLoweringInfo CLI(DAG);
23603   CLI.setDebugLoc(dl)
23604       .setChain(DAG.getEntryNode())
23605       .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23606
23607   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23608
23609   if (isF64)
23610     // Returned in xmm0 and xmm1.
23611     return CallResult.first;
23612
23613   // Returned in bits 0:31 and 32:64 xmm0.
23614   SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23615                                CallResult.first, DAG.getIntPtrConstant(0, dl));
23616   SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23617                                CallResult.first, DAG.getIntPtrConstant(1, dl));
23618   SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23619   return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23620 }
23621
23622 /// Widen a vector input to a vector of NVT.  The
23623 /// input vector must have the same element type as NVT.
23624 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23625                             bool FillWithZeroes = false) {
23626   // Check if InOp already has the right width.
23627   MVT InVT = InOp.getSimpleValueType();
23628   if (InVT == NVT)
23629     return InOp;
23630
23631   if (InOp.isUndef())
23632     return DAG.getUNDEF(NVT);
23633
23634   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23635          "input and widen element type must match");
23636
23637   unsigned InNumElts = InVT.getVectorNumElements();
23638   unsigned WidenNumElts = NVT.getVectorNumElements();
23639   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23640          "Unexpected request for vector widening");
23641
23642   SDLoc dl(InOp);
23643   if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23644       InOp.getNumOperands() == 2) {
23645     SDValue N1 = InOp.getOperand(1);
23646     if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23647         N1.isUndef()) {
23648       InOp = InOp.getOperand(0);
23649       InVT = InOp.getSimpleValueType();
23650       InNumElts = InVT.getVectorNumElements();
23651     }
23652   }
23653   if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23654       ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23655     SmallVector<SDValue, 16> Ops;
23656     for (unsigned i = 0; i < InNumElts; ++i)
23657       Ops.push_back(InOp.getOperand(i));
23658
23659     EVT EltVT = InOp.getOperand(0).getValueType();
23660
23661     SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23662       DAG.getUNDEF(EltVT);
23663     for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23664       Ops.push_back(FillVal);
23665     return DAG.getBuildVector(NVT, dl, Ops);
23666   }
23667   SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23668     DAG.getUNDEF(NVT);
23669   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23670                      InOp, DAG.getIntPtrConstant(0, dl));
23671 }
23672 /// Lower MSCATTER to a scatter node that also returns its (killed) mask.
23673 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23674                              SelectionDAG &DAG) {
23675   assert(Subtarget.hasAVX512() &&
23676          "MGATHER/MSCATTER are supported on AVX-512 arch only");
23677
23678   // The X86 scatter kills its mask register, so the mask type has to be
23679   // added to the list of return values.
23680   // If the "scatter" already has 2 return values, it has been handled.
23681   if (Op.getNode()->getNumValues() == 2)
23682     return Op;
23683
23684   MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23685   SDValue Src = N->getValue();
23686   MVT VT = Src.getSimpleValueType();
23687   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23688   SDLoc dl(Op);
23689
23690   SDValue NewScatter;
23691   SDValue Index = N->getIndex();
23692   SDValue Mask = N->getMask();
23693   SDValue Chain = N->getChain();
23694   SDValue BasePtr = N->getBasePtr();
23695   MVT MemVT = N->getMemoryVT().getSimpleVT();
23696   MVT IndexVT = Index.getSimpleValueType();
23697   MVT MaskVT = Mask.getSimpleValueType();
23698   // IndexVT and MaskVT stay fixed when Index/Mask are widened below.
23699   if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23700     // The v2i32 value was promoted to v2i64.
23701     // Now we "redo" the type legalizer's work and widen the original
23702     // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23703     // with a shuffle taking elements 0 and 2 of the v4i32 bitcast.
23704     assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23705            "Unexpected memory type");
23706     int ShuffleMask[] = {0, 2, -1, -1};
23707     Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23708                                DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23709     // Now we have 4 elements instead of 2.
23710     // Expand the index (the added elements are undef).
23711     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23712     Index = ExtendToType(Index, NewIndexVT, DAG);
23713
23714     // Expand the mask with zeroes.
23715     // The mask may be <2 x i64> or <2 x i1> at this moment.
23716     assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23717            "Unexpected mask type");
23718     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23719     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23720     VT = MVT::v4i32;
23721   }
23722
23723   unsigned NumElts = VT.getVectorNumElements();
23724   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23725       !Index.getSimpleValueType().is512BitVector()) {
23726     // AVX512F (without VLX) supports only 512-bit vectors: either the data or
23727     // the index must be 512 bits wide. If the index is v8i32, sign-extending
23728     // it to v8i64 is all that is needed.
23729     if (IndexVT == MVT::v8i32)
23730       // Just extend index
23731       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23732     else {
23733       // The minimal number of elts in scatter is 8
23734       NumElts = 8;
23735       // Index
23736       MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23737       // Use the original index here, do not modify the index twice
23738       Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23739       if (IndexVT.getScalarType() == MVT::i32)
23740         Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23741
23742       // Mask
23743       // At this point the mask operand has already been promoted
23744       assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23745       MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23746       // Use the original mask here, do not modify the mask twice
23747       Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23748
23749       // The value that should be stored
23750       MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23751       Src = ExtendToType(Src, NewVT, DAG);
23752     }
23753   }
23754   // Convert the mask to an i1 vector with NumElts elements.
23755   MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23756   Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23757
23758   // The mask is killed by the scatter, so add it to the result values.
23759   SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23760   SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23761   NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23762                                     N->getMemOperand());
23763   DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23764   return SDValue(NewScatter.getNode(), 1);
23765 }
23766
23767 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23768                           SelectionDAG &DAG) {
23769
23770   MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23771   MVT VT = Op.getSimpleValueType();
23772   MVT ScalarVT = VT.getScalarType();
23773   SDValue Mask = N->getMask();
23774   SDLoc dl(Op);
23775
23776   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23777          "Expanding masked load is supported on AVX-512 target only!");
23778
23779   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23780          "Expanding masked load is supported for 32 and 64-bit types only!");
23781
23782   // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23783   // VLX. These types for exp-loads are handled here.
23784   if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23785     return Op;
23786
23787   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23788          "Cannot lower masked load op.");
23789
23790   assert((ScalarVT.getSizeInBits() >= 32 ||
23791           (Subtarget.hasBWI() &&
23792               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23793          "Unsupported masked load op.");
23794
23795   // This operation is legal for targets with VLX, but without
23796   // VLX the vector should be widened to 512 bit
23797   unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23798   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23799   SDValue Src0 = N->getSrc0();
23800   Src0 = ExtendToType(Src0, WideDataVT, DAG);
23801
23802   // Mask element has to be i1.
23803   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23804   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23805          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23806
23807   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23808
23809   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23810   if (MaskEltTy != MVT::i1)
23811     Mask = DAG.getNode(ISD::TRUNCATE, dl,
23812                        MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23813   SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23814                                       N->getBasePtr(), Mask, Src0,
23815                                       N->getMemoryVT(), N->getMemOperand(),
23816                                       N->getExtensionType(),
23817                                       N->isExpandingLoad());
23818
23819   SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23820                                NewLoad.getValue(0),
23821                                DAG.getIntPtrConstant(0, dl));
23822   SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
23823   return DAG.getMergeValues(RetOps, dl);
23824 }
23825
23826 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23827                            SelectionDAG &DAG) {
23828   MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23829   SDValue DataToStore = N->getValue();
23830   MVT VT = DataToStore.getSimpleValueType();
23831   MVT ScalarVT = VT.getScalarType();
23832   SDValue Mask = N->getMask();
23833   SDLoc dl(Op);
23834
23835   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23836          "Expanding masked load is supported on AVX-512 target only!");
23837
23838   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23839          "Expanding masked load is supported for 32 and 64-bit types only!");
23840
23841   // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
23842   if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23843     return Op;
23844
23845   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23846          "Cannot lower masked store op.");
23847
23848   assert((ScalarVT.getSizeInBits() >= 32 ||
23849           (Subtarget.hasBWI() &&
23850               (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23851           "Unsupported masked store op.");
23852
23853   // This operation is legal for targets with VLX, but without
23854   // VLX the vector should be widened to 512 bit
23855   unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23856   MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23857
23858   // Mask element has to be i1.
23859   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23860   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23861          "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23862
23863   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23864
23865   DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23866   Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23867   if (MaskEltTy != MVT::i1)
23868     Mask = DAG.getNode(ISD::TRUNCATE, dl,
23869                        MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23870   return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23871                             Mask, N->getMemoryVT(), N->getMemOperand(),
23872                             N->isTruncatingStore(), N->isCompressingStore());
23873 }
23874 /// Lower MGATHER on AVX-512, widening operands or using an X86 gather node.
23875 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23876                             SelectionDAG &DAG) {
23877   assert(Subtarget.hasAVX512() &&
23878          "MGATHER/MSCATTER are supported on AVX-512 arch only");
23879
23880   MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23881   SDLoc dl(Op);
23882   MVT VT = Op.getSimpleValueType();
23883   SDValue Index = N->getIndex();
23884   SDValue Mask = N->getMask();
23885   SDValue Src0 = N->getValue();
23886   MVT IndexVT = Index.getSimpleValueType();
23887   MVT MaskVT = Mask.getSimpleValueType();
23888
23889   unsigned NumElts = VT.getVectorNumElements();
23890   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23891
23892   if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23893       !Index.getSimpleValueType().is512BitVector()) {
23894     // AVX512F (without VLX) supports only 512-bit vectors: either the data or
23895     // the index must be 512 bits wide. If the vector has 8 elements, it is
23896     // enough to sign-extend the index to v8i64 and update the node in place.
23897     if (NumElts == 8) {
23898       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23899       SDValue Ops[] = { N->getOperand(0), N->getOperand(1),  N->getOperand(2),
23900                         N->getOperand(3), Index };
23901       DAG.UpdateNodeOperands(N, Ops);
23902       return Op;
23903     }
23904
23905     // Minimal number of elements in Gather
23906     NumElts = 8;
23907     // Index: widen to 8 elements, then sign-extend i32 indices to v8i64.
23908     MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23909     Index = ExtendToType(Index, NewIndexVT, DAG);
23910     if (IndexVT.getScalarType() == MVT::i32)
23911       Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23912
23913     // Mask: widen with zeroes, then truncate to an i1 vector.
23914     MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23915     // At this point the mask operand has already been promoted
23916     assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23917     MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23918     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23919     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23920
23921     // The pass-through value
23922     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23923     Src0 = ExtendToType(Src0, NewVT, DAG);
23924
23925     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23926     SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23927                                             N->getMemoryVT(), dl, Ops,
23928                                             N->getMemOperand());
23929     SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23930                                  NewGather.getValue(0),
23931                                  DAG.getIntPtrConstant(0, dl));
23932     SDValue RetOps[] = {Exract, NewGather.getValue(1)};
23933     return DAG.getMergeValues(RetOps, dl);
23934   }
23935   if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
23936     // There is a special case when the return type v2i32 is illegal and
23937     // the type legalizer extended it to v2i64. Without this conversion we end
23938     // up with VPGATHERQQ (reading q-words from memory) instead of VPGATHERQD.
23939     // In order to avoid this situation, we'll build an X86 specific Gather node
23940     // with index v2i64 and value type v4i32.
23941     assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
23942            "Unexpected type in masked gather");
23943     Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
23944                                 DAG.getBitcast(MVT::v4i32, Src0),
23945                                 DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
23946     // The mask should match the destination type. Extending the mask with
23947     // zeroes is not necessary since the instruction itself reads only two
23948     // values from memory.
23949     Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
23950     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23951     SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23952       DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
23953       N->getMemOperand());
23954
23955     SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
23956                                   NewGather.getValue(0), DAG);
23957     SDValue RetOps[] = { Sext, NewGather.getValue(1) };
23958     return DAG.getMergeValues(RetOps, dl);
23959   }
23960   if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
23961     // This transformation is for optimization only.
23962     // The type legalizer extended the mask and index to 4-element vectors
23963     // in order to match the requirement of the common gather node - same
23964     // vector width of index and value. The X86 Gather node allows a mismatch
23965     // of vector widths in order to select a more optimal instruction at the
23966     // end. If the operands do not have the expected shape, Op is kept as is.
23967     assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
23968            "Unexpected type in masked gather");
23969     if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
23970         ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
23971         Index.getOpcode() == ISD::CONCAT_VECTORS &&
23972         Index.getOperand(1).isUndef()) {
23973       Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
23974       Index = Index.getOperand(0);
23975     } else
23976       return Op;
23977     SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23978     SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23979       DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
23980       N->getMemOperand());
23981
23982     SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
23983     return DAG.getMergeValues(RetOps, dl);
23984
23985   }
23986   return Op;
23987 }
23988
23989 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23990                                                     SelectionDAG &DAG) const {
23991   // TODO: Eventually, the lowering of these nodes should be informed by or
23992   // deferred to the GC strategy for the function in which they appear. For
23993   // now, however, they must be lowered to something. Since they are logically
23994   // no-ops in the case of a null GC strategy (or a GC strategy which does not
23995   // require special handling for these nodes), lower them as literal NOOPs for
23996   // the time being.
23997   SmallVector<SDValue, 2> Ops;
23998
23999   Ops.push_back(Op.getOperand(0));
24000   if (Op->getGluedNode())
24001     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24002
24003   SDLoc OpDL(Op);
24004   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24005   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24006
24007   return NOOP;
24008 }
24009
24010 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24011                                                   SelectionDAG &DAG) const {
24012   // TODO: Eventually, the lowering of these nodes should be informed by or
24013   // deferred to the GC strategy for the function in which they appear. For
24014   // now, however, they must be lowered to something. Since they are logically
24015   // no-ops in the case of a null GC strategy (or a GC strategy which does not
24016   // require special handling for these nodes), lower them as literal NOOPs for
24017   // the time being.
24018   SmallVector<SDValue, 2> Ops;
24019
24020   Ops.push_back(Op.getOperand(0));
24021   if (Op->getGluedNode())
24022     Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24023
24024   SDLoc OpDL(Op);
24025   SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24026   SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24027
24028   return NOOP;
24029 }
24030
24031 /// Provide custom lowering hooks for some operations.
      ///
      /// Dispatches on the opcode of \p Op to the matching Lower*/lower* helper and
      /// returns whatever that helper returns. Some helpers additionally receive
      /// Subtarget. An opcode with no case below reaches llvm_unreachable.
24032 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24033   switch (Op.getOpcode()) {
24034   default: llvm_unreachable("Should not custom lower this!");
24035   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24036   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24037     return LowerCMP_SWAP(Op, Subtarget, DAG);
24038   case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
        // All of the atomic read-modify-write opcodes listed here share one helper.
24039   case ISD::ATOMIC_LOAD_ADD:
24040   case ISD::ATOMIC_LOAD_SUB:
24041   case ISD::ATOMIC_LOAD_OR:
24042   case ISD::ATOMIC_LOAD_XOR:
24043   case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
24044   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
24045   case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
24046   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
24047   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24048   case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
24049   case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
24050   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24051   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
24052   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24053   case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24054   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24055   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
24056   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
24057   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
24058   case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
24059   case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
24060   case ISD::SHL_PARTS:
24061   case ISD::SRA_PARTS:
24062   case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
24063   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
24064   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
24065   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
24066   case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
24067   case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24068   case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
24069   case ISD::ZERO_EXTEND_VECTOR_INREG:
24070   case ISD::SIGN_EXTEND_VECTOR_INREG:
24071     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
        // Signed and unsigned FP-to-int conversions share one helper.
24072   case ISD::FP_TO_SINT:
24073   case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
24074   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
        // NOTE(review): judging by the helper names, LOAD here (and STORE at the
        // end of the switch) are expected to be the extending / truncating forms
        // only -- confirm against the setOperationAction calls for these opcodes.
24075   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
24076   case ISD::FABS:
24077   case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
24078   case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
24079   case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
24080   case ISD::SETCC:              return LowerSETCC(Op, DAG);
24081   case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
24082   case ISD::SELECT:             return LowerSELECT(Op, DAG);
24083   case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
24084   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
24085   case ISD::VASTART:            return LowerVASTART(Op, DAG);
24086   case ISD::VAARG:              return LowerVAARG(Op, DAG);
24087   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
24088   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
        // INTRINSIC_VOID and INTRINSIC_W_CHAIN are handled by the same helper.
24089   case ISD::INTRINSIC_VOID:
24090   case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24091   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
24092   case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
24093   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
24094   case ISD::FRAME_TO_ARGS_OFFSET:
24095                                 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24096   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24097   case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
24098   case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
24099   case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
24100   case ISD::EH_SJLJ_SETUP_DISPATCH:
24101     return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24102   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
24103   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
24104   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
        // The *_ZERO_UNDEF variants go through the same helpers as the plain
        // count-leading/trailing-zeros opcodes.
24105   case ISD::CTLZ:
24106   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
24107   case ISD::CTTZ:
24108   case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
24109   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
24110   case ISD::MULHS:
24111   case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
24112   case ISD::UMUL_LOHI:
24113   case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
24114   case ISD::ROTL:
24115   case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
24116   case ISD::SRA:
24117   case ISD::SRL:
24118   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
        // All six arithmetic-with-overflow opcodes share LowerXALUO.
24119   case ISD::SADDO:
24120   case ISD::UADDO:
24121   case ISD::SSUBO:
24122   case ISD::USUBO:
24123   case ISD::SMULO:
24124   case ISD::UMULO:              return LowerXALUO(Op, DAG);
24125   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24126   case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
24127   case ISD::ADDCARRY:
24128   case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
24129   case ISD::ADD:
24130   case ISD::SUB:                return LowerADD_SUB(Op, DAG);
24131   case ISD::SMAX:
24132   case ISD::SMIN:
24133   case ISD::UMAX:
24134   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
24135   case ISD::ABS:                return LowerABS(Op, DAG);
24136   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
24137   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
24138   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
24139   case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
24140   case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
24141   case ISD::GC_TRANSITION_START:
24142                                 return LowerGC_TRANSITION_START(Op, DAG);
24143   case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
24144   case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
24145   }
24146 }
24147
24148 /// Places new result values for the node in Results (their number
24149 /// and types must exactly match those of the original return values of
24150 /// the node), or leaves Results empty, which indicates that the node is not
24151 /// to be custom lowered after all.
      ///
      /// Implemented by calling LowerOperation on value 0 of \p N and appending the
      /// first N->getNumValues() values of the returned node to \p Results.
24152 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24153                                               SmallVectorImpl<SDValue> &Results,
24154                                               SelectionDAG &DAG) const {
24155   SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24156
        // A null result means LowerOperation declined; leave Results untouched.
24157   if (!Res.getNode())
24158     return;
24159
        // The lowered node may produce more values than N, but never fewer.
24160   assert((N->getNumValues() <= Res->getNumValues()) &&
24161       "Lowering returned the wrong number of results!");
24162
24163   // Place the new result values based on N's result count.
24164   // In some cases (LowerSINT_TO_FP for example) Res has more result values
24165   // than the original node; the chain (last value) should be dropped.
24166   for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24167     Results.push_back(Res.getValue(I));
24168 }
24169
24170 /// Replace a node with an illegal result type with a new node built out of
24171 /// custom code.
      ///
      /// Switches on the opcode of \p N and, for the cases it handles, appends the
      /// replacement value(s) to \p Results. Several paths return (or break out of
      /// the switch) without appending anything, which leaves \p Results exactly as
      /// it was on entry. An opcode with no case below reaches llvm_unreachable.
24172 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24173                                            SmallVectorImpl<SDValue>&Results,
24174                                            SelectionDAG &DAG) const {
24175   SDLoc dl(N);
24176   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24177   switch (N->getOpcode()) {
24178   default:
24179     llvm_unreachable("Do not know how to custom type legalize this operation!");
24180   case X86ISD::AVG: {
24181     // Legalize types for X86ISD::AVG by expanding vectors.
24182     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24183
24184     auto InVT = N->getValueType(0);
24185     auto InVTSize = InVT.getSizeInBits();
          // Pick 128, 256 or 512 bits: the smallest of the three that is not
          // smaller than the input type.
24186     const unsigned RegSize =
24187         (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24188     assert((Subtarget.hasBWI() || RegSize < 512) &&
24189            "512-bit vector requires AVX512BW");
24190     assert((Subtarget.hasAVX2() || RegSize < 256) &&
24191            "256-bit vector requires AVX2");
24192
          // RegVT keeps the element type and has as many elements as fit RegSize.
24193     auto ElemVT = InVT.getVectorElementType();
24194     auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24195                                   RegSize / ElemVT.getSizeInBits());
24196     assert(RegSize % InVT.getSizeInBits() == 0);
24197     unsigned NumConcat = RegSize / InVT.getSizeInBits();
24198
          // Widen each operand by concatenating it with undef pieces; the same
          // Ops array is reused, only slot 0 changes between the two concats.
24199     SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24200     Ops[0] = N->getOperand(0);
24201     SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24202     Ops[0] = N->getOperand(1);
24203     SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24204
          // Do the AVG at the wide type and take the subvector at index 0 back
          // out as the original type.
24205     SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24206     Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24207                                   DAG.getIntPtrConstant(0, dl)));
24208     return;
24209   }
24210   // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24211   case X86ISD::FMINC:
24212   case X86ISD::FMIN:
24213   case X86ISD::FMAXC:
24214   case X86ISD::FMAX: {
24215     EVT VT = N->getValueType(0);
24216     assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
          // Pad both operands with an undef v2f32 upper half and redo the same
          // opcode at v4f32. The value pushed is the v4f32 node itself.
24217     SDValue UNDEF = DAG.getUNDEF(VT);
24218     SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24219                               N->getOperand(0), UNDEF);
24220     SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24221                               N->getOperand(1), UNDEF);
24222     Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24223     return;
24224   }
        // All division/remainder opcodes are forwarded to LowerWin64_i128OP.
24225   case ISD::SDIV:
24226   case ISD::UDIV:
24227   case ISD::SREM:
24228   case ISD::UREM:
24229   case ISD::SDIVREM:
24230   case ISD::UDIVREM: {
24231     SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24232     Results.push_back(V);
24233     return;
24234   }
24235   case ISD::FP_TO_SINT:
24236   case ISD::FP_TO_UINT: {
24237     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24238
          // v2i32 results are produced by converting at v4i32 and extracting the
          // subvector at index 0.
24239     if (N->getValueType(0) == MVT::v2i32) {
24240       assert((IsSigned || Subtarget.hasAVX512()) &&
24241              "Can only handle signed conversion without AVX512");
24242       assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24243       SDValue Src = N->getOperand(0);
            // v2f64 source: use the X86 truncating-convert node directly.
24244       if (Src.getValueType() == MVT::v2f64) {
24245         SDValue Idx = DAG.getIntPtrConstant(0, dl);
24246         SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24247                                            : X86ISD::CVTTP2UI,
24248                                   dl, MVT::v4i32, Src);
24249         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24250         Results.push_back(Res);
24251         return;
24252       }
            // v2f32 source: widen to v4f32 with an undef upper half, then use the
            // generic FP_TO_SINT/FP_TO_UINT at v4i32.
24253       if (Src.getValueType() == MVT::v2f32) {
24254         SDValue Idx = DAG.getIntPtrConstant(0, dl);
24255         SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24256                                   DAG.getUNDEF(MVT::v2f32));
24257         Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24258                                    : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24259         Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24260         Results.push_back(Res);
24261         return;
24262       }
24263
24264       // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24265       // so early out here.
24266       return;
24267     }
24268
          // Remaining cases: FP_TO_INTHelper returns a (value, stack slot) pair.
          // Nothing is pushed when the first element is null.
24269     std::pair<SDValue,SDValue> Vals =
24270         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24271     SDValue FIST = Vals.first, StackSlot = Vals.second;
24272     if (FIST.getNode()) {
24273       EVT VT = N->getValueType(0);
24274       // Return a load from the stack slot.
            // (When no stack slot was returned, FIST itself is the result.)
24275       if (StackSlot.getNode())
24276         Results.push_back(
24277             DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24278       else
24279         Results.push_back(FIST);
24280     }
24281     return;
24282   }
24283   case ISD::SINT_TO_FP: {
24284     assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24285     SDValue Src = N->getOperand(0);
          // Only v2i64 -> v2f32 is handled; anything else pushes no result.
24286     if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24287       return;
24288     Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24289     return;
24290   }
24291   case ISD::UINT_TO_FP: {
24292     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24293     EVT VT = N->getValueType(0);
          // Only v2f32 results are handled; anything else pushes no result.
24294     if (VT != MVT::v2f32)
24295       return;
24296     SDValue Src = N->getOperand(0);
24297     EVT SrcVT = Src.getValueType();
          // v2i64 source with AVX512DQ+VL: a single CVTUI2P producing v4f32.
24298     if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24299       Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24300       return;
24301     }
24302     if (SrcVT != MVT::v2i32)
24303       return;
          // v2i32 source: zero-extend each element to 64 bits, OR it into the bit
          // pattern of the double 2^52 (0x4330000000000000), reinterpret as
          // v2f64, subtract 2^52 again, and round the v2f64 result to f32 with
          // VFPROUND (which is built at v4f32).
24304     SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24305     SDValue VBias =
24306         DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24307     SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24308                              DAG.getBitcast(MVT::v2i64, VBias));
24309     Or = DAG.getBitcast(MVT::v2f64, Or);
24310     // TODO: Are there any fast-math-flags to propagate here?
24311     SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24312     Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24313     return;
24314   }
24315   case ISD::FP_ROUND: {
          // Push nothing while the operand's type is not yet legal.
24316     if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24317         return;
24318     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24319     Results.push_back(V);
24320     return;
24321   }
24322   case ISD::FP_EXTEND: {
24323     // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24324     // No other ValueType for FP_EXTEND should reach this point.
24325     assert(N->getValueType(0) == MVT::v2f32 &&
24326            "Do not know how to legalize this Node");
          // Nothing is pushed here, so Results is left unchanged.
24327     return;
24328   }
24329   case ISD::INTRINSIC_W_CHAIN: {
          // Operand 1 holds the intrinsic ID as a constant. Every handled ID
          // returns from inside the inner switch and its default is
          // llvm_unreachable, so control does not continue into the
          // INTRINSIC_WO_CHAIN case that follows.
24330     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24331     switch (IntNo) {
24332     default : llvm_unreachable("Do not know how to custom type "
24333                                "legalize this intrinsic operation!");
24334     case Intrinsic::x86_rdtsc:
24335       return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24336                                      Results);
24337     case Intrinsic::x86_rdtscp:
24338       return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24339                                      Results);
24340     case Intrinsic::x86_rdpmc:
24341       return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24342
24343     case Intrinsic::x86_xgetbv:
24344       return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24345     }
24346   }
24347   case ISD::INTRINSIC_WO_CHAIN: {
          // Push the lowered value only if the helper produced one.
24348     if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24349       Results.push_back(V);
24350     return;
24351   }
24352   case ISD::READCYCLECOUNTER: {
          // Same helper and node type as the x86_rdtsc intrinsic above.
24353     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24354                                    Results);
24355   }
24356   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24357     EVT T = N->getValueType(0);
24358     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
          // i128 uses the 64-bit register set (RAX/RDX/RCX/RBX), i64 the 32-bit
          // one (EAX/EDX/ECX/EBX).
24359     bool Regs64bit = T == MVT::i128;
24360     MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
          // Split operand 2 (the compare value, going by the cpIn* names) into
          // low/high halves and copy them into [ER]AX and [ER]DX, threading
          // chain and glue from one copy to the next.
24361     SDValue cpInL, cpInH;
24362     cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24363                         DAG.getConstant(0, dl, HalfT));
24364     cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24365                         DAG.getConstant(1, dl, HalfT));
24366     cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24367                              Regs64bit ? X86::RAX : X86::EAX,
24368                              cpInL, SDValue());
24369     cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24370                              Regs64bit ? X86::RDX : X86::EDX,
24371                              cpInH, cpInL.getValue(1));
          // Split operand 3 (the swap-in value) the same way. Only the high half
          // is copied to [ER]CX here; where the low half goes depends on the
          // base-pointer check below.
24372     SDValue swapInL, swapInH;
24373     swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24374                           DAG.getConstant(0, dl, HalfT));
24375     swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24376                           DAG.getConstant(1, dl, HalfT));
24377     swapInH =
24378         DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24379                          swapInH, cpInH.getValue(1));
24380     // If the current function needs the base pointer, RBX,
24381     // we shouldn't use cmpxchg directly.
24382     // Indeed the lowering of that instruction will clobber
24383     // that register and since RBX will be a reserved register
24384     // the register allocator will not make sure its value will
24385     // be properly saved and restored around this live-range.
24386     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24387     SDValue Result;
24388     SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24389     unsigned BasePtr = TRI->getBaseRegister();
24390     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24391     if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24392         (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24393       // ISel prefers the LCMPXCHG64 variant.
24394       // If that assert breaks, that means it is not the case anymore,
24395       // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24396       // not just EBX. This is a matter of accepting i64 input for that
24397       // pseudo, and restoring into the register of the right width
24398       // in expand pseudo. Everything else should just work.
24399       assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24400              "Saving only half of the RBX");
24401       unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24402                                   : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
            // Read the current [ER]BX value and hand it, together with the low
            // half of the swap-in value, to the *_SAVE_*BX node as operands
            // instead of writing [ER]BX here.
24403       SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24404                                            Regs64bit ? X86::RBX : X86::EBX,
24405                                            HalfT, swapInH.getValue(1));
24406       SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24407                        RBXSave,
24408                        /*Glue*/ RBXSave.getValue(2)};
24409       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24410     } else {
            // No conflict with the base pointer: copy the low half of the
            // swap-in value into [ER]BX and emit the plain LCMPXCHG node.
24411       unsigned Opcode =
24412           Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24413       swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24414                                  Regs64bit ? X86::RBX : X86::EBX, swapInL,
24415                                  swapInH.getValue(1));
24416       SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24417                        swapInL.getValue(1)};
24418       Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24419     }
          // Read the two result halves back from [ER]AX and [ER]DX, again chained
          // and glued to the cmpxchg node and to each other.
24420     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24421                                         Regs64bit ? X86::RAX : X86::EAX,
24422                                         HalfT, Result.getValue(1));
24423     SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24424                                         Regs64bit ? X86::RDX : X86::EDX,
24425                                         HalfT, cpOutL.getValue(2));
24426     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24427
          // The success flag is COND_E on the EFLAGS value read after the result
          // registers, resized to the type of N's result 1.
24428     SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24429                                         MVT::i32, cpOutH.getValue(2));
24430     SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24431     Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24432
          // Three results, in order: the reassembled value, the success flag, and
          // value 1 of the EFLAGS copy (its chain output).
24433     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24434     Results.push_back(Success);
24435     Results.push_back(EFLAGS.getValue(1));
24436     return;
24437   }
24438   case ISD::ATOMIC_SWAP:
24439   case ISD::ATOMIC_LOAD_ADD:
24440   case ISD::ATOMIC_LOAD_SUB:
24441   case ISD::ATOMIC_LOAD_AND:
24442   case ISD::ATOMIC_LOAD_OR:
24443   case ISD::ATOMIC_LOAD_XOR:
24444   case ISD::ATOMIC_LOAD_NAND:
24445   case ISD::ATOMIC_LOAD_MIN:
24446   case ISD::ATOMIC_LOAD_MAX:
24447   case ISD::ATOMIC_LOAD_UMIN:
24448   case ISD::ATOMIC_LOAD_UMAX:
24449   case ISD::ATOMIC_LOAD: {
24450     // Delegate to generic TypeLegalization. Situations we can really handle
24451     // should have already been dealt with by AtomicExpandPass.cpp.
          // (The break leaves the switch with nothing pushed to Results.)
24452     break;
24453   }
24454   case ISD::BITCAST: {
24455     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24456     EVT DstVT = N->getValueType(0);
24457     EVT SrcVT = N->getOperand(0)->getValueType(0);
24458
          // Only f64 -> v2i32 / v4i16 / v8i8 is handled; anything else pushes no
          // result.
24459     if (SrcVT != MVT::f64 ||
24460         (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24461       return;
24462
          // Put the f64 into a v2f64 with SCALAR_TO_VECTOR and reinterpret that as
          // an integer vector with the same element type as DstVT but twice as
          // many elements.
24463     unsigned NumElts = DstVT.getVectorNumElements();
24464     EVT SVT = DstVT.getVectorElementType();
24465     EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24466     SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24467                                    MVT::v2f64, N->getOperand(0));
24468     SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24469
24470     if (ExperimentalVectorWideningLegalization) {
24471       // If we are legalizing vectors by widening, we already have the desired
24472       // legal vector type, just return it.
24473       Results.push_back(ToVecInt);
24474       return;
24475     }
24476
          // Otherwise extract the first NumElts elements of the wide vector and
          // rebuild a vector of the original DstVT from them.
24477     SmallVector<SDValue, 8> Elts;
24478     for (unsigned i = 0, e = NumElts; i != e; ++i)
24479       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24480                                    ToVecInt, DAG.getIntPtrConstant(i, dl)));
24481
24482     Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24483   }
24484   }
24485 }
24486
24487 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24488   switch ((X86ISD::NodeType)Opcode) {
24489   case X86ISD::FIRST_NUMBER:       break;
24490   case X86ISD::BSF:                return "X86ISD::BSF";
24491   case X86ISD::BSR:                return "X86ISD::BSR";
24492   case X86ISD::SHLD:               return "X86ISD::SHLD";
24493   case X86ISD::SHRD:               return "X86ISD::SHRD";
24494   case X86ISD::FAND:               return "X86ISD::FAND";
24495   case X86ISD::FANDN:              return "X86ISD::FANDN";
24496   case X86ISD::FOR:                return "X86ISD::FOR";
24497   case X86ISD::FXOR:               return "X86ISD::FXOR";
24498   case X86ISD::FILD:               return "X86ISD::FILD";
24499   case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
24500   case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24501   case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24502   case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24503   case X86ISD::FLD:                return "X86ISD::FLD";
24504   case X86ISD::FST:                return "X86ISD::FST";
24505   case X86ISD::CALL:               return "X86ISD::CALL";
24506   case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
24507   case X86ISD::RDTSCP_DAG:         return "X86ISD::RDTSCP_DAG";
24508   case X86ISD::RDPMC_DAG:          return "X86ISD::RDPMC_DAG";
24509   case X86ISD::BT:                 return "X86ISD::BT";
24510   case X86ISD::CMP:                return "X86ISD::CMP";
24511   case X86ISD::COMI:               return "X86ISD::COMI";
24512   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
24513   case X86ISD::CMPM:               return "X86ISD::CMPM";
24514   case X86ISD::CMPMU:              return "X86ISD::CMPMU";
24515   case X86ISD::CMPM_RND:           return "X86ISD::CMPM_RND";
24516   case X86ISD::SETCC:              return "X86ISD::SETCC";
24517   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
24518   case X86ISD::FSETCC:             return "X86ISD::FSETCC";
24519   case X86ISD::FSETCCM:            return "X86ISD::FSETCCM";
24520   case X86ISD::FSETCCM_RND:        return "X86ISD::FSETCCM_RND";
24521   case X86ISD::CMOV:               return "X86ISD::CMOV";
24522   case X86ISD::BRCOND:             return "X86ISD::BRCOND";
24523   case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
24524   case X86ISD::IRET:               return "X86ISD::IRET";
24525   case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
24526   case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
24527   case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
24528   case X86ISD::Wrapper:            return "X86ISD::Wrapper";
24529   case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
24530   case X86ISD::MOVDQ2Q:            return "X86ISD::MOVDQ2Q";
24531   case X86ISD::MMX_MOVD2W:         return "X86ISD::MMX_MOVD2W";
24532   case X86ISD::MMX_MOVW2D:         return "X86ISD::MMX_MOVW2D";
24533   case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
24534   case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
24535   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
24536   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
24537   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
24538   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
24539   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
24540   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
24541   case X86ISD::SHRUNKBLEND:        return "X86ISD::SHRUNKBLEND";
24542   case X86ISD::ADDUS:              return "X86ISD::ADDUS";
24543   case X86ISD::SUBUS:              return "X86ISD::SUBUS";
24544   case X86ISD::HADD:               return "X86ISD::HADD";
24545   case X86ISD::HSUB:               return "X86ISD::HSUB";
24546   case X86ISD::FHADD:              return "X86ISD::FHADD";
24547   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
24548   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
24549   case X86ISD::FMAX:               return "X86ISD::FMAX";
24550   case X86ISD::FMAXS:              return "X86ISD::FMAXS";
24551   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
24552   case X86ISD::FMAXS_RND:          return "X86ISD::FMAX_RND";
24553   case X86ISD::FMIN:               return "X86ISD::FMIN";
24554   case X86ISD::FMINS:              return "X86ISD::FMINS";
24555   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
24556   case X86ISD::FMINS_RND:          return "X86ISD::FMINS_RND";
24557   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
24558   case X86ISD::FMINC:              return "X86ISD::FMINC";
24559   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
24560   case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
24561   case X86ISD::FRCP:               return "X86ISD::FRCP";
24562   case X86ISD::FRCPS:              return "X86ISD::FRCPS";
24563   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
24564   case X86ISD::INSERTQI:           return "X86ISD::INSERTQI";
24565   case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
24566   case X86ISD::TLSBASEADDR:        return "X86ISD::TLSBASEADDR";
24567   case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
24568   case X86ISD::EH_SJLJ_SETJMP:     return "X86ISD::EH_SJLJ_SETJMP";
24569   case X86ISD::EH_SJLJ_LONGJMP:    return "X86ISD::EH_SJLJ_LONGJMP";
24570   case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24571     return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24572   case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
24573   case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
24574   case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
24575   case X86ISD::FNSTSW16r:          return "X86ISD::FNSTSW16r";
24576   case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
24577   case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
24578   case X86ISD::LCMPXCHG16_DAG:     return "X86ISD::LCMPXCHG16_DAG";
24579   case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24580     return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24581   case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24582     return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24583   case X86ISD::LADD:               return "X86ISD::LADD";
24584   case X86ISD::LSUB:               return "X86ISD::LSUB";
24585   case X86ISD::LOR:                return "X86ISD::LOR";
24586   case X86ISD::LXOR:               return "X86ISD::LXOR";
24587   case X86ISD::LAND:               return "X86ISD::LAND";
24588   case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
24589   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
24590   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
24591   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
24592   case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
24593   case X86ISD::VTRUNCS:            return "X86ISD::VTRUNCS";
24594   case X86ISD::VTRUNCUS:           return "X86ISD::VTRUNCUS";
24595   case X86ISD::VTRUNCSTORES:       return "X86ISD::VTRUNCSTORES";
24596   case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
24597   case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
24598   case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
24599   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
24600   case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
24601   case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
24602   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
24603   case X86ISD::VFPROUND_RND:       return "X86ISD::VFPROUND_RND";
24604   case X86ISD::VFPROUNDS_RND:      return "X86ISD::VFPROUNDS_RND";
24605   case X86ISD::CVT2MASK:           return "X86ISD::CVT2MASK";
24606   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
24607   case X86ISD::VSRLDQ:             return "X86ISD::VSRLDQ";
24608   case X86ISD::VSHL:               return "X86ISD::VSHL";
24609   case X86ISD::VSRL:               return "X86ISD::VSRL";
24610   case X86ISD::VSRA:               return "X86ISD::VSRA";
24611   case X86ISD::VSHLI:              return "X86ISD::VSHLI";
24612   case X86ISD::VSRLI:              return "X86ISD::VSRLI";
24613   case X86ISD::VSRAI:              return "X86ISD::VSRAI";
24614   case X86ISD::VSRAV:              return "X86ISD::VSRAV";
24615   case X86ISD::VROTLI:             return "X86ISD::VROTLI";
24616   case X86ISD::VROTRI:             return "X86ISD::VROTRI";
24617   case X86ISD::VPPERM:             return "X86ISD::VPPERM";
24618   case X86ISD::CMPP:               return "X86ISD::CMPP";
24619   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
24620   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
24621   case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
24622   case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
24623   case X86ISD::ADD:                return "X86ISD::ADD";
24624   case X86ISD::SUB:                return "X86ISD::SUB";
24625   case X86ISD::ADC:                return "X86ISD::ADC";
24626   case X86ISD::SBB:                return "X86ISD::SBB";
24627   case X86ISD::SMUL:               return "X86ISD::SMUL";
24628   case X86ISD::UMUL:               return "X86ISD::UMUL";
24629   case X86ISD::SMUL8:              return "X86ISD::SMUL8";
24630   case X86ISD::UMUL8:              return "X86ISD::UMUL8";
24631   case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24632   case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24633   case X86ISD::INC:                return "X86ISD::INC";
24634   case X86ISD::DEC:                return "X86ISD::DEC";
24635   case X86ISD::OR:                 return "X86ISD::OR";
24636   case X86ISD::XOR:                return "X86ISD::XOR";
24637   case X86ISD::AND:                return "X86ISD::AND";
24638   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
24639   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
24640   case X86ISD::MOVMSK:             return "X86ISD::MOVMSK";
24641   case X86ISD::PTEST:              return "X86ISD::PTEST";
24642   case X86ISD::TESTP:              return "X86ISD::TESTP";
24643   case X86ISD::TESTM:              return "X86ISD::TESTM";
24644   case X86ISD::TESTNM:             return "X86ISD::TESTNM";
24645   case X86ISD::KORTEST:            return "X86ISD::KORTEST";
24646   case X86ISD::KTEST:              return "X86ISD::KTEST";
24647   case X86ISD::KSHIFTL:            return "X86ISD::KSHIFTL";
24648   case X86ISD::KSHIFTR:            return "X86ISD::KSHIFTR";
24649   case X86ISD::PACKSS:             return "X86ISD::PACKSS";
24650   case X86ISD::PACKUS:             return "X86ISD::PACKUS";
24651   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
24652   case X86ISD::VALIGN:             return "X86ISD::VALIGN";
24653   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
24654   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
24655   case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
24656   case X86ISD::SHUFP:              return "X86ISD::SHUFP";
24657   case X86ISD::SHUF128:            return "X86ISD::SHUF128";
24658   case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
24659   case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
24660   case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
24661   case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
24662   case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
24663   case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
24664   case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
24665   case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
24666   case X86ISD::MOVSD:              return "X86ISD::MOVSD";
24667   case X86ISD::MOVSS:              return "X86ISD::MOVSS";
24668   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
24669   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
24670   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
24671   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
24672   case X86ISD::SUBV_BROADCAST:     return "X86ISD::SUBV_BROADCAST";
24673   case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
24674   case X86ISD::VPERMILPV:          return "X86ISD::VPERMILPV";
24675   case X86ISD::VPERMILPI:          return "X86ISD::VPERMILPI";
24676   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
24677   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
24678   case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
24679   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
24680   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
24681   case X86ISD::VPTERNLOG:          return "X86ISD::VPTERNLOG";
24682   case X86ISD::VFIXUPIMM:          return "X86ISD::VFIXUPIMM";
24683   case X86ISD::VFIXUPIMMS:          return "X86ISD::VFIXUPIMMS";
24684   case X86ISD::VRANGE:             return "X86ISD::VRANGE";
24685   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
24686   case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
24687   case X86ISD::PSADBW:             return "X86ISD::PSADBW";
24688   case X86ISD::DBPSADBW:           return "X86ISD::DBPSADBW";
24689   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24690   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
24691   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
24692   case X86ISD::MEMBARRIER:         return "X86ISD::MEMBARRIER";
24693   case X86ISD::MFENCE:             return "X86ISD::MFENCE";
24694   case X86ISD::SEG_ALLOCA:         return "X86ISD::SEG_ALLOCA";
24695   case X86ISD::SAHF:               return "X86ISD::SAHF";
24696   case X86ISD::RDRAND:             return "X86ISD::RDRAND";
24697   case X86ISD::RDSEED:             return "X86ISD::RDSEED";
24698   case X86ISD::VPMADDUBSW:         return "X86ISD::VPMADDUBSW";
24699   case X86ISD::VPMADDWD:           return "X86ISD::VPMADDWD";
24700   case X86ISD::VPROT:              return "X86ISD::VPROT";
24701   case X86ISD::VPROTI:             return "X86ISD::VPROTI";
24702   case X86ISD::VPSHA:              return "X86ISD::VPSHA";
24703   case X86ISD::VPSHL:              return "X86ISD::VPSHL";
24704   case X86ISD::VPCOM:              return "X86ISD::VPCOM";
24705   case X86ISD::VPCOMU:             return "X86ISD::VPCOMU";
24706   case X86ISD::VPERMIL2:           return "X86ISD::VPERMIL2";
24707   case X86ISD::FMADD:              return "X86ISD::FMADD";
24708   case X86ISD::FMSUB:              return "X86ISD::FMSUB";
24709   case X86ISD::FNMADD:             return "X86ISD::FNMADD";
24710   case X86ISD::FNMSUB:             return "X86ISD::FNMSUB";
24711   case X86ISD::FMADDSUB:           return "X86ISD::FMADDSUB";
24712   case X86ISD::FMSUBADD:           return "X86ISD::FMSUBADD";
24713   case X86ISD::FMADD_RND:          return "X86ISD::FMADD_RND";
24714   case X86ISD::FNMADD_RND:         return "X86ISD::FNMADD_RND";
24715   case X86ISD::FMSUB_RND:          return "X86ISD::FMSUB_RND";
24716   case X86ISD::FNMSUB_RND:         return "X86ISD::FNMSUB_RND";
24717   case X86ISD::FMADDSUB_RND:       return "X86ISD::FMADDSUB_RND";
24718   case X86ISD::FMSUBADD_RND:       return "X86ISD::FMSUBADD_RND";
24719   case X86ISD::FMADDS1_RND:        return "X86ISD::FMADDS1_RND";
24720   case X86ISD::FNMADDS1_RND:       return "X86ISD::FNMADDS1_RND";
24721   case X86ISD::FMSUBS1_RND:        return "X86ISD::FMSUBS1_RND";
24722   case X86ISD::FNMSUBS1_RND:       return "X86ISD::FNMSUBS1_RND";
24723   case X86ISD::FMADDS3_RND:        return "X86ISD::FMADDS3_RND";
24724   case X86ISD::FNMADDS3_RND:       return "X86ISD::FNMADDS3_RND";
24725   case X86ISD::FMSUBS3_RND:        return "X86ISD::FMSUBS3_RND";
24726   case X86ISD::FNMSUBS3_RND:       return "X86ISD::FNMSUBS3_RND";
24727   case X86ISD::VPMADD52H:          return "X86ISD::VPMADD52H";
24728   case X86ISD::VPMADD52L:          return "X86ISD::VPMADD52L";
24729   case X86ISD::VRNDSCALE:          return "X86ISD::VRNDSCALE";
24730   case X86ISD::VRNDSCALES:         return "X86ISD::VRNDSCALES";
24731   case X86ISD::VREDUCE:            return "X86ISD::VREDUCE";
24732   case X86ISD::VREDUCES:           return "X86ISD::VREDUCES";
24733   case X86ISD::VGETMANT:           return "X86ISD::VGETMANT";
24734   case X86ISD::VGETMANTS:          return "X86ISD::VGETMANTS";
24735   case X86ISD::PCMPESTRI:          return "X86ISD::PCMPESTRI";
24736   case X86ISD::PCMPISTRI:          return "X86ISD::PCMPISTRI";
24737   case X86ISD::XTEST:              return "X86ISD::XTEST";
24738   case X86ISD::COMPRESS:           return "X86ISD::COMPRESS";
24739   case X86ISD::EXPAND:             return "X86ISD::EXPAND";
24740   case X86ISD::SELECT:             return "X86ISD::SELECT";
24741   case X86ISD::SELECTS:            return "X86ISD::SELECTS";
24742   case X86ISD::ADDSUB:             return "X86ISD::ADDSUB";
24743   case X86ISD::RCP28:              return "X86ISD::RCP28";
24744   case X86ISD::RCP28S:             return "X86ISD::RCP28S";
24745   case X86ISD::EXP2:               return "X86ISD::EXP2";
24746   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
24747   case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
24748   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
24749   case X86ISD::FADDS_RND:          return "X86ISD::FADDS_RND";
24750   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
24751   case X86ISD::FSUBS_RND:          return "X86ISD::FSUBS_RND";
24752   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
24753   case X86ISD::FMULS_RND:          return "X86ISD::FMULS_RND";
24754   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
24755   case X86ISD::FDIVS_RND:          return "X86ISD::FDIVS_RND";
24756   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
24757   case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
24758   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
24759   case X86ISD::FGETEXPS_RND:       return "X86ISD::FGETEXPS_RND";
24760   case X86ISD::SCALEF:             return "X86ISD::SCALEF";
24761   case X86ISD::SCALEFS:            return "X86ISD::SCALEFS";
24762   case X86ISD::ADDS:               return "X86ISD::ADDS";
24763   case X86ISD::SUBS:               return "X86ISD::SUBS";
24764   case X86ISD::AVG:                return "X86ISD::AVG";
24765   case X86ISD::MULHRS:             return "X86ISD::MULHRS";
24766   case X86ISD::SINT_TO_FP_RND:     return "X86ISD::SINT_TO_FP_RND";
24767   case X86ISD::UINT_TO_FP_RND:     return "X86ISD::UINT_TO_FP_RND";
24768   case X86ISD::CVTTP2SI:           return "X86ISD::CVTTP2SI";
24769   case X86ISD::CVTTP2UI:           return "X86ISD::CVTTP2UI";
24770   case X86ISD::CVTTP2SI_RND:       return "X86ISD::CVTTP2SI_RND";
24771   case X86ISD::CVTTP2UI_RND:       return "X86ISD::CVTTP2UI_RND";
24772   case X86ISD::CVTTS2SI_RND:       return "X86ISD::CVTTS2SI_RND";
24773   case X86ISD::CVTTS2UI_RND:       return "X86ISD::CVTTS2UI_RND";
24774   case X86ISD::CVTSI2P:            return "X86ISD::CVTSI2P";
24775   case X86ISD::CVTUI2P:            return "X86ISD::CVTUI2P";
24776   case X86ISD::VFPCLASS:           return "X86ISD::VFPCLASS";
24777   case X86ISD::VFPCLASSS:          return "X86ISD::VFPCLASSS";
24778   case X86ISD::MULTISHIFT:         return "X86ISD::MULTISHIFT";
24779   case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24780   case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24781   case X86ISD::CVTPS2PH:           return "X86ISD::CVTPS2PH";
24782   case X86ISD::CVTPH2PS:           return "X86ISD::CVTPH2PS";
24783   case X86ISD::CVTP2SI:            return "X86ISD::CVTP2SI";
24784   case X86ISD::CVTP2UI:            return "X86ISD::CVTP2UI";
24785   case X86ISD::CVTP2SI_RND:        return "X86ISD::CVTP2SI_RND";
24786   case X86ISD::CVTP2UI_RND:        return "X86ISD::CVTP2UI_RND";
24787   case X86ISD::CVTS2SI_RND:        return "X86ISD::CVTS2SI_RND";
24788   case X86ISD::CVTS2UI_RND:        return "X86ISD::CVTS2UI_RND";
24789   case X86ISD::LWPINS:             return "X86ISD::LWPINS";
24790   case X86ISD::MGATHER:            return "X86ISD::MGATHER";
24791   }
24792   return nullptr;
24793 }
24794
24795 /// Return true if the addressing mode represented by AM is legal for this
24796 /// target, for a load/store of the specified type.
24797 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24798                                               const AddrMode &AM, Type *Ty,
24799                                               unsigned AS) const {
24800   // X86 supports extremely general addressing modes.
24801   CodeModel::Model M = getTargetMachine().getCodeModel();
24802
24803   // X86 allows a sign-extended 32-bit immediate field as a displacement.
24804   if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24805     return false;
24806
24807   if (AM.BaseGV) {
24808     unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24809
24810     // If a reference to this global requires an extra load, we can't fold it.
24811     if (isGlobalStubReference(GVFlags))
24812       return false;
24813
24814     // If BaseGV requires a register for the PIC base, we cannot also have a
24815     // BaseReg specified.
24816     if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24817       return false;
24818
24819     // If lower 4G is not available, then we must use rip-relative addressing.
24820     if ((M != CodeModel::Small || isPositionIndependent()) &&
24821         Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24822       return false;
24823   }
24824
24825   switch (AM.Scale) {
24826   case 0:
24827   case 1:
24828   case 2:
24829   case 4:
24830   case 8:
24831     // These scales always work.
24832     break;
24833   case 3:
24834   case 5:
24835   case 9:
24836     // These scales are formed with basereg+scalereg.  Only accept if there is
24837     // no basereg yet.
24838     if (AM.HasBaseReg)
24839       return false;
24840     break;
24841   default:  // Other stuff never works.
24842     return false;
24843   }
24844
24845   return true;
24846 }
24847
24848 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24849   unsigned Bits = Ty->getScalarSizeInBits();
24850
24851   // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24852   // particularly cheaper than those without.
24853   if (Bits == 8)
24854     return false;
24855
24856   // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24857   // variable shifts just as cheap as scalar ones.
24858   if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24859     return false;
24860
24861   // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24862   // fully general vector.
24863   return true;
24864 }
24865
24866 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24867   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24868     return false;
24869   unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24870   unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24871   return NumBits1 > NumBits2;
24872 }
24873
24874 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24875   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24876     return false;
24877
24878   if (!isTypeLegal(EVT::getEVT(Ty1)))
24879     return false;
24880
24881   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24882
24883   // Assuming the caller doesn't have a zeroext or signext return parameter,
24884   // truncation all the way down to i1 is valid.
24885   return true;
24886 }
24887
24888 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24889   return isInt<32>(Imm);
24890 }
24891
24892 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24893   // Can also use sub to handle negated immediates.
24894   return isInt<32>(Imm);
24895 }
24896
24897 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24898   if (!VT1.isInteger() || !VT2.isInteger())
24899     return false;
24900   unsigned NumBits1 = VT1.getSizeInBits();
24901   unsigned NumBits2 = VT2.getSizeInBits();
24902   return NumBits1 > NumBits2;
24903 }
24904
24905 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24906   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24907   return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24908 }
24909
24910 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24911   // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24912   return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24913 }
24914
24915 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24916   EVT VT1 = Val.getValueType();
24917   if (isZExtFree(VT1, VT2))
24918     return true;
24919
24920   if (Val.getOpcode() != ISD::LOAD)
24921     return false;
24922
24923   if (!VT1.isSimple() || !VT1.isInteger() ||
24924       !VT2.isSimple() || !VT2.isInteger())
24925     return false;
24926
24927   switch (VT1.getSimpleVT().SimpleTy) {
24928   default: break;
24929   case MVT::i8:
24930   case MVT::i16:
24931   case MVT::i32:
24932     // X86 has 8, 16, and 32-bit zero-extending loads.
24933     return true;
24934   }
24935
24936   return false;
24937 }
24938
24939 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24940
24941 bool
24942 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24943   if (!Subtarget.hasAnyFMA())
24944     return false;
24945
24946   VT = VT.getScalarType();
24947
24948   if (!VT.isSimple())
24949     return false;
24950
24951   switch (VT.getSimpleVT().SimpleTy) {
24952   case MVT::f32:
24953   case MVT::f64:
24954     return true;
24955   default:
24956     break;
24957   }
24958
24959   return false;
24960 }
24961
24962 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24963   // i16 instructions are longer (0x66 prefix) and potentially slower.
24964   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24965 }
24966
24967 /// Targets can use this to indicate that they only support *some*
24968 /// VECTOR_SHUFFLE operations, those with specific masks.
24969 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24970 /// are assumed to be legal.
24971 bool
24972 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24973                                       EVT VT) const {
24974   if (!VT.isSimple())
24975     return false;
24976
24977   // Not for i1 vectors
24978   if (VT.getSimpleVT().getScalarType() == MVT::i1)
24979     return false;
24980
24981   // Very little shuffling can be done for 64-bit vectors right now.
24982   if (VT.getSimpleVT().getSizeInBits() == 64)
24983     return false;
24984
24985   // We only care that the types being shuffled are legal. The lowering can
24986   // handle any possible shuffle mask that results.
24987   return isTypeLegal(VT.getSimpleVT());
24988 }
24989
24990 bool
24991 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24992                                           EVT VT) const {
24993   // Just delegate to the generic legality, clear masks aren't special.
24994   return isShuffleMaskLegal(Mask, VT);
24995 }
24996
24997 //===----------------------------------------------------------------------===//
24998 //                           X86 Scheduler Hooks
24999 //===----------------------------------------------------------------------===//
25000
25001 /// Utility function to emit xbegin specifying the start of an RTM region.
25002 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25003                                      const TargetInstrInfo *TII) {
25004   DebugLoc DL = MI.getDebugLoc();
25005
25006   const BasicBlock *BB = MBB->getBasicBlock();
25007   MachineFunction::iterator I = ++MBB->getIterator();
25008
25009   // For the v = xbegin(), we generate
25010   //
25011   // thisMBB:
25012   //  xbegin sinkMBB
25013   //
25014   // mainMBB:
25015   //  s0 = -1
25016   //
25017   // fallBB:
25018   //  eax = # XABORT_DEF
25019   //  s1 = eax
25020   //
25021   // sinkMBB:
25022   //  v = phi(s0/mainBB, s1/fallBB)
25023
25024   MachineBasicBlock *thisMBB = MBB;
25025   MachineFunction *MF = MBB->getParent();
25026   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25027   MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25028   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25029   MF->insert(I, mainMBB);
25030   MF->insert(I, fallMBB);
25031   MF->insert(I, sinkMBB);
25032
25033   // Transfer the remainder of BB and its successor edges to sinkMBB.
25034   sinkMBB->splice(sinkMBB->begin(), MBB,
25035                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25036   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25037
25038   MachineRegisterInfo &MRI = MF->getRegInfo();
25039   unsigned DstReg = MI.getOperand(0).getReg();
25040   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25041   unsigned mainDstReg = MRI.createVirtualRegister(RC);
25042   unsigned fallDstReg = MRI.createVirtualRegister(RC);
25043
25044   // thisMBB:
25045   //  xbegin fallMBB
25046   //  # fallthrough to mainMBB
25047   //  # abortion to fallMBB
25048   BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25049   thisMBB->addSuccessor(mainMBB);
25050   thisMBB->addSuccessor(fallMBB);
25051
25052   // mainMBB:
25053   //  mainDstReg := -1
25054   BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25055   BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25056   mainMBB->addSuccessor(sinkMBB);
25057
25058   // fallMBB:
25059   //  ; pseudo instruction to model hardware's definition from XABORT
25060   //  EAX := XABORT_DEF
25061   //  fallDstReg := EAX
25062   BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25063   BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25064       .addReg(X86::EAX);
25065   fallMBB->addSuccessor(sinkMBB);
25066
25067   // sinkMBB:
25068   //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25069   BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25070       .addReg(mainDstReg).addMBB(mainMBB)
25071       .addReg(fallDstReg).addMBB(fallMBB);
25072
25073   MI.eraseFromParent();
25074   return sinkMBB;
25075 }
25076
25077 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25078 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25079 // in the .td file.
25080 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25081                                        const TargetInstrInfo *TII) {
25082   unsigned Opc;
25083   switch (MI.getOpcode()) {
25084   default: llvm_unreachable("illegal opcode!");
25085   case X86::PCMPISTRM128REG:  Opc = X86::PCMPISTRM128rr;  break;
25086   case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25087   case X86::PCMPISTRM128MEM:  Opc = X86::PCMPISTRM128rm;  break;
25088   case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25089   case X86::PCMPESTRM128REG:  Opc = X86::PCMPESTRM128rr;  break;
25090   case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25091   case X86::PCMPESTRM128MEM:  Opc = X86::PCMPESTRM128rm;  break;
25092   case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25093   }
25094
25095   DebugLoc dl = MI.getDebugLoc();
25096   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25097
25098   unsigned NumArgs = MI.getNumOperands();
25099   for (unsigned i = 1; i < NumArgs; ++i) {
25100     MachineOperand &Op = MI.getOperand(i);
25101     if (!(Op.isReg() && Op.isImplicit()))
25102       MIB.add(Op);
25103   }
25104   if (MI.hasOneMemOperand())
25105     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25106
25107   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25108       .addReg(X86::XMM0);
25109
25110   MI.eraseFromParent();
25111   return BB;
25112 }
25113
25114 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25115 // defs in an instruction pattern
25116 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25117                                        const TargetInstrInfo *TII) {
25118   unsigned Opc;
25119   switch (MI.getOpcode()) {
25120   default: llvm_unreachable("illegal opcode!");
25121   case X86::PCMPISTRIREG:  Opc = X86::PCMPISTRIrr;  break;
25122   case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25123   case X86::PCMPISTRIMEM:  Opc = X86::PCMPISTRIrm;  break;
25124   case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25125   case X86::PCMPESTRIREG:  Opc = X86::PCMPESTRIrr;  break;
25126   case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25127   case X86::PCMPESTRIMEM:  Opc = X86::PCMPESTRIrm;  break;
25128   case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25129   }
25130
25131   DebugLoc dl = MI.getDebugLoc();
25132   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25133
25134   unsigned NumArgs = MI.getNumOperands(); // remove the results
25135   for (unsigned i = 1; i < NumArgs; ++i) {
25136     MachineOperand &Op = MI.getOperand(i);
25137     if (!(Op.isReg() && Op.isImplicit()))
25138       MIB.add(Op);
25139   }
25140   if (MI.hasOneMemOperand())
25141     MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25142
25143   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25144       .addReg(X86::ECX);
25145
25146   MI.eraseFromParent();
25147   return BB;
25148 }
25149
25150 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25151                                      const X86Subtarget &Subtarget) {
25152   DebugLoc dl = MI.getDebugLoc();
25153   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25154
25155   // insert input VAL into EAX
25156   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25157       .addReg(MI.getOperand(0).getReg());
25158   // insert zero to ECX
25159   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25160
25161   // insert zero to EDX
25162   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25163
25164   // insert WRPKRU instruction
25165   BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25166
25167   MI.eraseFromParent(); // The pseudo is gone now.
25168   return BB;
25169 }
25170
25171 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25172                                      const X86Subtarget &Subtarget) {
25173   DebugLoc dl = MI.getDebugLoc();
25174   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25175
25176   // insert zero to ECX
25177   BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25178
25179   // insert RDPKRU instruction
25180   BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25181   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25182       .addReg(X86::EAX);
25183
25184   MI.eraseFromParent(); // The pseudo is gone now.
25185   return BB;
25186 }
25187
25188 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25189                                       const X86Subtarget &Subtarget,
25190                                       unsigned Opc) {
25191   DebugLoc dl = MI.getDebugLoc();
25192   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25193   // Address into RAX/EAX, other two args into ECX, EDX.
25194   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25195   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25196   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25197   for (int i = 0; i < X86::AddrNumOperands; ++i)
25198     MIB.add(MI.getOperand(i));
25199
25200   unsigned ValOps = X86::AddrNumOperands;
25201   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25202       .addReg(MI.getOperand(ValOps).getReg());
25203   BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25204       .addReg(MI.getOperand(ValOps + 1).getReg());
25205
25206   // The instruction doesn't actually take any operands though.
25207   BuildMI(*BB, MI, dl, TII->get(Opc));
25208
25209   MI.eraseFromParent(); // The pseudo is gone now.
25210   return BB;
25211 }
25212
25213 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25214                                       const X86Subtarget &Subtarget) {
25215   DebugLoc dl = MI->getDebugLoc();
25216   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25217   // Address into RAX/EAX
25218   unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25219   unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25220   MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25221   for (int i = 0; i < X86::AddrNumOperands; ++i)
25222     MIB.add(MI->getOperand(i));
25223
25224   // The instruction doesn't actually take any operands though.
25225   BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25226
25227   MI->eraseFromParent(); // The pseudo is gone now.
25228   return BB;
25229 }
25230
25231
25232
25233 MachineBasicBlock *
25234 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25235                                                  MachineBasicBlock *MBB) const {
25236   // Emit va_arg instruction on X86-64.
25237
25238   // Operands to this pseudo-instruction:
25239   // 0  ) Output        : destination address (reg)
25240   // 1-5) Input         : va_list address (addr, i64mem)
25241   // 6  ) ArgSize       : Size (in bytes) of vararg type
25242   // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25243   // 8  ) Align         : Alignment of type
25244   // 9  ) EFLAGS (implicit-def)
25245
25246   assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25247   static_assert(X86::AddrNumOperands == 5,
25248                 "VAARG_64 assumes 5 address operands");
25249
25250   unsigned DestReg = MI.getOperand(0).getReg();
25251   MachineOperand &Base = MI.getOperand(1);
25252   MachineOperand &Scale = MI.getOperand(2);
25253   MachineOperand &Index = MI.getOperand(3);
25254   MachineOperand &Disp = MI.getOperand(4);
25255   MachineOperand &Segment = MI.getOperand(5);
25256   unsigned ArgSize = MI.getOperand(6).getImm();
25257   unsigned ArgMode = MI.getOperand(7).getImm();
25258   unsigned Align = MI.getOperand(8).getImm();
25259
25260   // Memory Reference
25261   assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25262   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25263   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25264
25265   // Machine Information
25266   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25267   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25268   const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25269   const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25270   DebugLoc DL = MI.getDebugLoc();
25271
25272   // struct va_list {
25273   //   i32   gp_offset
25274   //   i32   fp_offset
25275   //   i64   overflow_area (address)
25276   //   i64   reg_save_area (address)
25277   // }
25278   // sizeof(va_list) = 24
25279   // alignment(va_list) = 8
25280
25281   unsigned TotalNumIntRegs = 6;
25282   unsigned TotalNumXMMRegs = 8;
25283   bool UseGPOffset = (ArgMode == 1);
25284   bool UseFPOffset = (ArgMode == 2);
25285   unsigned MaxOffset = TotalNumIntRegs * 8 +
25286                        (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
25287
25288   /* Align ArgSize to a multiple of 8 */
25289   unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25290   bool NeedsAlign = (Align > 8);
25291
25292   MachineBasicBlock *thisMBB = MBB;
25293   MachineBasicBlock *overflowMBB;
25294   MachineBasicBlock *offsetMBB;
25295   MachineBasicBlock *endMBB;
25296
25297   unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
25298   unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
25299   unsigned OffsetReg = 0;
25300
25301   if (!UseGPOffset && !UseFPOffset) {
25302     // If we only pull from the overflow region, we don't create a branch.
25303     // We don't need to alter control flow.
25304     OffsetDestReg = 0; // unused
25305     OverflowDestReg = DestReg;
25306
25307     offsetMBB = nullptr;
25308     overflowMBB = thisMBB;
25309     endMBB = thisMBB;
25310   } else {
25311     // First emit code to check if gp_offset (or fp_offset) is below the bound.
25312     // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25313     // If not, pull from overflow_area. (branch to overflowMBB)
25314     //
25315     //       thisMBB
25316     //         |     .
25317     //         |        .
25318     //     offsetMBB   overflowMBB
25319     //         |        .
25320     //         |     .
25321     //        endMBB
25322
25323     // Registers for the PHI in endMBB
25324     OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25325     OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25326
25327     const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25328     MachineFunction *MF = MBB->getParent();
25329     overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25330     offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25331     endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25332
25333     MachineFunction::iterator MBBIter = ++MBB->getIterator();
25334
25335     // Insert the new basic blocks
25336     MF->insert(MBBIter, offsetMBB);
25337     MF->insert(MBBIter, overflowMBB);
25338     MF->insert(MBBIter, endMBB);
25339
25340     // Transfer the remainder of MBB and its successor edges to endMBB.
25341     endMBB->splice(endMBB->begin(), thisMBB,
25342                    std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25343     endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25344
25345     // Make offsetMBB and overflowMBB successors of thisMBB
25346     thisMBB->addSuccessor(offsetMBB);
25347     thisMBB->addSuccessor(overflowMBB);
25348
25349     // endMBB is a successor of both offsetMBB and overflowMBB
25350     offsetMBB->addSuccessor(endMBB);
25351     overflowMBB->addSuccessor(endMBB);
25352
25353     // Load the offset value into a register
25354     OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25355     BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25356         .add(Base)
25357         .add(Scale)
25358         .add(Index)
25359         .addDisp(Disp, UseFPOffset ? 4 : 0)
25360         .add(Segment)
25361         .setMemRefs(MMOBegin, MMOEnd);
25362
25363     // Check if there is enough room left to pull this argument.
25364     BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25365       .addReg(OffsetReg)
25366       .addImm(MaxOffset + 8 - ArgSizeA8);
25367
25368     // Branch to "overflowMBB" if offset >= max
25369     // Fall through to "offsetMBB" otherwise
25370     BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25371       .addMBB(overflowMBB);
25372   }
25373
25374   // In offsetMBB, emit code to use the reg_save_area.
25375   if (offsetMBB) {
25376     assert(OffsetReg != 0);
25377
25378     // Read the reg_save_area address.
25379     unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25380     BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25381         .add(Base)
25382         .add(Scale)
25383         .add(Index)
25384         .addDisp(Disp, 16)
25385         .add(Segment)
25386         .setMemRefs(MMOBegin, MMOEnd);
25387
25388     // Zero-extend the offset
25389     unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25390       BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25391         .addImm(0)
25392         .addReg(OffsetReg)
25393         .addImm(X86::sub_32bit);
25394
25395     // Add the offset to the reg_save_area to get the final address.
25396     BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25397       .addReg(OffsetReg64)
25398       .addReg(RegSaveReg);
25399
25400     // Compute the offset for the next argument
25401     unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25402     BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25403       .addReg(OffsetReg)
25404       .addImm(UseFPOffset ? 16 : 8);
25405
25406     // Store it back into the va_list.
25407     BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25408         .add(Base)
25409         .add(Scale)
25410         .add(Index)
25411         .addDisp(Disp, UseFPOffset ? 4 : 0)
25412         .add(Segment)
25413         .addReg(NextOffsetReg)
25414         .setMemRefs(MMOBegin, MMOEnd);
25415
25416     // Jump to endMBB
25417     BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25418       .addMBB(endMBB);
25419   }
25420
25421   //
25422   // Emit code to use overflow area
25423   //
25424
25425   // Load the overflow_area address into a register.
25426   unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25427   BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25428       .add(Base)
25429       .add(Scale)
25430       .add(Index)
25431       .addDisp(Disp, 8)
25432       .add(Segment)
25433       .setMemRefs(MMOBegin, MMOEnd);
25434
25435   // If we need to align it, do so. Otherwise, just copy the address
25436   // to OverflowDestReg.
25437   if (NeedsAlign) {
25438     // Align the overflow address
25439     assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25440     unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25441
25442     // aligned_addr = (addr + (align-1)) & ~(align-1)
25443     BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25444       .addReg(OverflowAddrReg)
25445       .addImm(Align-1);
25446
25447     BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25448       .addReg(TmpReg)
25449       .addImm(~(uint64_t)(Align-1));
25450   } else {
25451     BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25452       .addReg(OverflowAddrReg);
25453   }
25454
25455   // Compute the next overflow address after this argument.
25456   // (the overflow address should be kept 8-byte aligned)
25457   unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25458   BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25459     .addReg(OverflowDestReg)
25460     .addImm(ArgSizeA8);
25461
25462   // Store the new overflow address.
25463   BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25464       .add(Base)
25465       .add(Scale)
25466       .add(Index)
25467       .addDisp(Disp, 8)
25468       .add(Segment)
25469       .addReg(NextAddrReg)
25470       .setMemRefs(MMOBegin, MMOEnd);
25471
25472   // If we branched, emit the PHI to the front of endMBB.
25473   if (offsetMBB) {
25474     BuildMI(*endMBB, endMBB->begin(), DL,
25475             TII->get(X86::PHI), DestReg)
25476       .addReg(OffsetDestReg).addMBB(offsetMBB)
25477       .addReg(OverflowDestReg).addMBB(overflowMBB);
25478   }
25479
25480   // Erase the pseudo instruction
25481   MI.eraseFromParent();
25482
25483   return endMBB;
25484 }
25485
25486 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25487     MachineInstr &MI, MachineBasicBlock *MBB) const {
25488   // Emit code to save XMM registers to the stack. The ABI says that the
25489   // number of registers to save is given in %al, so it's theoretically
25490   // possible to do an indirect jump trick to avoid saving all of them,
25491   // however this code takes a simpler approach and just executes all
25492   // of the stores if %al is non-zero. It's less code, and it's probably
25493   // easier on the hardware branch predictor, and stores aren't all that
25494   // expensive anyway.
25495
25496   // Create the new basic blocks. One block contains all the XMM stores,
25497   // and one block is the final destination regardless of whether any
25498   // stores were performed.
25499   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25500   MachineFunction *F = MBB->getParent();
25501   MachineFunction::iterator MBBIter = ++MBB->getIterator();
25502   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25503   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25504   F->insert(MBBIter, XMMSaveMBB);
25505   F->insert(MBBIter, EndMBB);
25506
25507   // Transfer the remainder of MBB and its successor edges to EndMBB.
25508   EndMBB->splice(EndMBB->begin(), MBB,
25509                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25510   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25511
25512   // The original block will now fall through to the XMM save block.
25513   MBB->addSuccessor(XMMSaveMBB);
25514   // The XMMSaveMBB will fall through to the end block.
25515   XMMSaveMBB->addSuccessor(EndMBB);
25516
25517   // Now add the instructions.
25518   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25519   DebugLoc DL = MI.getDebugLoc();
25520
25521   unsigned CountReg = MI.getOperand(0).getReg();
25522   int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25523   int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25524
25525   if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25526     // If %al is 0, branch around the XMM save block.
25527     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25528     BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25529     MBB->addSuccessor(EndMBB);
25530   }
25531
25532   // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25533   // that was just emitted, but clearly shouldn't be "saved".
25534   assert((MI.getNumOperands() <= 3 ||
25535           !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25536           MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25537          "Expected last argument to be EFLAGS");
25538   unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25539   // In the XMM save block, save all the XMM argument registers.
25540   for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25541     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25542     MachineMemOperand *MMO = F->getMachineMemOperand(
25543         MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25544         MachineMemOperand::MOStore,
25545         /*Size=*/16, /*Align=*/16);
25546     BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25547         .addFrameIndex(RegSaveFrameIndex)
25548         .addImm(/*Scale=*/1)
25549         .addReg(/*IndexReg=*/0)
25550         .addImm(/*Disp=*/Offset)
25551         .addReg(/*Segment=*/0)
25552         .addReg(MI.getOperand(i).getReg())
25553         .addMemOperand(MMO);
25554   }
25555
25556   MI.eraseFromParent(); // The pseudo instruction is gone now.
25557
25558   return EndMBB;
25559 }
25560
25561 // The EFLAGS operand of SelectItr might be missing a kill marker
25562 // because there were multiple uses of EFLAGS, and ISel didn't know
25563 // which to mark. Figure out whether SelectItr should have had a
25564 // kill marker, and set it if it should. Returns the correct kill
25565 // marker value.
25566 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25567                                      MachineBasicBlock* BB,
25568                                      const TargetRegisterInfo* TRI) {
25569   // Scan forward through BB for a use/def of EFLAGS.
25570   MachineBasicBlock::iterator miI(std::next(SelectItr));
25571   for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25572     const MachineInstr& mi = *miI;
25573     if (mi.readsRegister(X86::EFLAGS))
25574       return false;
25575     if (mi.definesRegister(X86::EFLAGS))
25576       break; // Should have kill-flag - update below.
25577   }
25578
25579   // If we hit the end of the block, check whether EFLAGS is live into a
25580   // successor.
25581   if (miI == BB->end()) {
25582     for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25583                                           sEnd = BB->succ_end();
25584          sItr != sEnd; ++sItr) {
25585       MachineBasicBlock* succ = *sItr;
25586       if (succ->isLiveIn(X86::EFLAGS))
25587         return false;
25588     }
25589   }
25590
25591   // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25592   // out. SelectMI should have a kill flag on EFLAGS.
25593   SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25594   return true;
25595 }
25596
25597 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25598 // together with other CMOV pseudo-opcodes into a single basic-block with
25599 // conditional jump around it.
25600 static bool isCMOVPseudo(MachineInstr &MI) {
25601   switch (MI.getOpcode()) {
25602   case X86::CMOV_FR32:
25603   case X86::CMOV_FR64:
25604   case X86::CMOV_GR8:
25605   case X86::CMOV_GR16:
25606   case X86::CMOV_GR32:
25607   case X86::CMOV_RFP32:
25608   case X86::CMOV_RFP64:
25609   case X86::CMOV_RFP80:
25610   case X86::CMOV_V2F64:
25611   case X86::CMOV_V2I64:
25612   case X86::CMOV_V4F32:
25613   case X86::CMOV_V4F64:
25614   case X86::CMOV_V4I64:
25615   case X86::CMOV_V16F32:
25616   case X86::CMOV_V8F32:
25617   case X86::CMOV_V8F64:
25618   case X86::CMOV_V8I64:
25619   case X86::CMOV_V8I1:
25620   case X86::CMOV_V16I1:
25621   case X86::CMOV_V32I1:
25622   case X86::CMOV_V64I1:
25623     return true;
25624
25625   default:
25626     return false;
25627   }
25628 }
25629
25630 MachineBasicBlock *
25631 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25632                                      MachineBasicBlock *BB) const {
25633   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25634   DebugLoc DL = MI.getDebugLoc();
25635
25636   // To "insert" a SELECT_CC instruction, we actually have to insert the
25637   // diamond control-flow pattern.  The incoming instruction knows the
25638   // destination vreg to set, the condition code register to branch on, the
25639   // true/false values to select between, and a branch opcode to use.
25640   const BasicBlock *LLVM_BB = BB->getBasicBlock();
25641   MachineFunction::iterator It = ++BB->getIterator();
25642
25643   //  thisMBB:
25644   //  ...
25645   //   TrueVal = ...
25646   //   cmpTY ccX, r1, r2
25647   //   bCC copy1MBB
25648   //   fallthrough --> copy0MBB
25649   MachineBasicBlock *thisMBB = BB;
25650   MachineFunction *F = BB->getParent();
25651
25652   // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25653   // as described above, by inserting a BB, and then making a PHI at the join
25654   // point to select the true and false operands of the CMOV in the PHI.
25655   //
25656   // The code also handles two different cases of multiple CMOV opcodes
25657   // in a row.
25658   //
25659   // Case 1:
25660   // In this case, there are multiple CMOVs in a row, all which are based on
25661   // the same condition setting (or the exact opposite condition setting).
25662   // In this case we can lower all the CMOVs using a single inserted BB, and
25663   // then make a number of PHIs at the join point to model the CMOVs. The only
25664   // trickiness here, is that in a case like:
25665   //
25666   // t2 = CMOV cond1 t1, f1
25667   // t3 = CMOV cond1 t2, f2
25668   //
25669   // when rewriting this into PHIs, we have to perform some renaming on the
25670   // temps since you cannot have a PHI operand refer to a PHI result earlier
25671   // in the same block.  The "simple" but wrong lowering would be:
25672   //
25673   // t2 = PHI t1(BB1), f1(BB2)
25674   // t3 = PHI t2(BB1), f2(BB2)
25675   //
25676   // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25677   // renaming is to note that on the path through BB1, t2 is really just a
25678   // copy of t1, and do that renaming, properly generating:
25679   //
25680   // t2 = PHI t1(BB1), f1(BB2)
25681   // t3 = PHI t1(BB1), f2(BB2)
25682   //
25683   // Case 2, we lower cascaded CMOVs such as
25684   //
25685   //   (CMOV (CMOV F, T, cc1), T, cc2)
25686   //
25687   // to two successive branches.  For that, we look for another CMOV as the
25688   // following instruction.
25689   //
25690   // Without this, we would add a PHI between the two jumps, which ends up
25691   // creating a few copies all around. For instance, for
25692   //
25693   //    (sitofp (zext (fcmp une)))
25694   //
25695   // we would generate:
25696   //
25697   //         ucomiss %xmm1, %xmm0
25698   //         movss  <1.0f>, %xmm0
25699   //         movaps  %xmm0, %xmm1
25700   //         jne     .LBB5_2
25701   //         xorps   %xmm1, %xmm1
25702   // .LBB5_2:
25703   //         jp      .LBB5_4
25704   //         movaps  %xmm1, %xmm0
25705   // .LBB5_4:
25706   //         retq
25707   //
25708   // because this custom-inserter would have generated:
25709   //
25710   //   A
25711   //   | \
25712   //   |  B
25713   //   | /
25714   //   C
25715   //   | \
25716   //   |  D
25717   //   | /
25718   //   E
25719   //
25720   // A: X = ...; Y = ...
25721   // B: empty
25722   // C: Z = PHI [X, A], [Y, B]
25723   // D: empty
25724   // E: PHI [X, C], [Z, D]
25725   //
25726   // If we lower both CMOVs in a single step, we can instead generate:
25727   //
25728   //   A
25729   //   | \
25730   //   |  C
25731   //   | /|
25732   //   |/ |
25733   //   |  |
25734   //   |  D
25735   //   | /
25736   //   E
25737   //
25738   // A: X = ...; Y = ...
25739   // D: empty
25740   // E: PHI [X, A], [X, C], [Y, D]
25741   //
25742   // Which, in our sitofp/fcmp example, gives us something like:
25743   //
25744   //         ucomiss %xmm1, %xmm0
25745   //         movss  <1.0f>, %xmm0
25746   //         jne     .LBB5_4
25747   //         jp      .LBB5_4
25748   //         xorps   %xmm0, %xmm0
25749   // .LBB5_4:
25750   //         retq
25751   //
25752   MachineInstr *CascadedCMOV = nullptr;
25753   MachineInstr *LastCMOV = &MI;
25754   X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25755   X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25756   MachineBasicBlock::iterator NextMIIt =
25757       std::next(MachineBasicBlock::iterator(MI));
25758
25759   // Check for case 1, where there are multiple CMOVs with the same condition
25760   // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
25761   // number of jumps the most.
25762
25763   if (isCMOVPseudo(MI)) {
25764     // See if we have a string of CMOVS with the same condition.
25765     while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25766            (NextMIIt->getOperand(3).getImm() == CC ||
25767             NextMIIt->getOperand(3).getImm() == OppCC)) {
25768       LastCMOV = &*NextMIIt;
25769       ++NextMIIt;
25770     }
25771   }
25772
25773   // This checks for case 2, but only do this if we didn't already find
25774   // case 1, as indicated by LastCMOV == MI.
25775   if (LastCMOV == &MI && NextMIIt != BB->end() &&
25776       NextMIIt->getOpcode() == MI.getOpcode() &&
25777       NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25778       NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25779       NextMIIt->getOperand(1).isKill()) {
25780     CascadedCMOV = &*NextMIIt;
25781   }
25782
25783   MachineBasicBlock *jcc1MBB = nullptr;
25784
25785   // If we have a cascaded CMOV, we lower it to two successive branches to
25786   // the same block.  EFLAGS is used by both, so mark it as live in the second.
25787   if (CascadedCMOV) {
25788     jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25789     F->insert(It, jcc1MBB);
25790     jcc1MBB->addLiveIn(X86::EFLAGS);
25791   }
25792
25793   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25794   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25795   F->insert(It, copy0MBB);
25796   F->insert(It, sinkMBB);
25797
25798   // If the EFLAGS register isn't dead in the terminator, then claim that it's
25799   // live into the sink and copy blocks.
25800   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25801
25802   MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25803   if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25804       !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25805     copy0MBB->addLiveIn(X86::EFLAGS);
25806     sinkMBB->addLiveIn(X86::EFLAGS);
25807   }
25808
25809   // Transfer the remainder of BB and its successor edges to sinkMBB.
25810   sinkMBB->splice(sinkMBB->begin(), BB,
25811                   std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25812   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25813
25814   // Add the true and fallthrough blocks as its successors.
25815   if (CascadedCMOV) {
25816     // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25817     BB->addSuccessor(jcc1MBB);
25818
25819     // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25820     // jump to the sinkMBB.
25821     jcc1MBB->addSuccessor(copy0MBB);
25822     jcc1MBB->addSuccessor(sinkMBB);
25823   } else {
25824     BB->addSuccessor(copy0MBB);
25825   }
25826
25827   // The true block target of the first (or only) branch is always sinkMBB.
25828   BB->addSuccessor(sinkMBB);
25829
25830   // Create the conditional branch instruction.
25831   unsigned Opc = X86::GetCondBranchFromCond(CC);
25832   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25833
25834   if (CascadedCMOV) {
25835     unsigned Opc2 = X86::GetCondBranchFromCond(
25836         (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25837     BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25838   }
25839
25840   //  copy0MBB:
25841   //   %FalseValue = ...
25842   //   # fallthrough to sinkMBB
25843   copy0MBB->addSuccessor(sinkMBB);
25844
25845   //  sinkMBB:
25846   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25847   //  ...
25848   MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25849   MachineBasicBlock::iterator MIItEnd =
25850     std::next(MachineBasicBlock::iterator(LastCMOV));
25851   MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25852   DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25853   MachineInstrBuilder MIB;
25854
25855   // As we are creating the PHIs, we have to be careful if there is more than
25856   // one.  Later CMOVs may reference the results of earlier CMOVs, but later
25857   // PHIs have to reference the individual true/false inputs from earlier PHIs.
25858   // That also means that PHI construction must work forward from earlier to
25859   // later, and that the code must maintain a mapping from earlier PHI's
25860   // destination registers, and the registers that went into the PHI.
25861
25862   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25863     unsigned DestReg = MIIt->getOperand(0).getReg();
25864     unsigned Op1Reg = MIIt->getOperand(1).getReg();
25865     unsigned Op2Reg = MIIt->getOperand(2).getReg();
25866
25867     // If this CMOV we are generating is the opposite condition from
25868     // the jump we generated, then we have to swap the operands for the
25869     // PHI that is going to be generated.
25870     if (MIIt->getOperand(3).getImm() == OppCC)
25871         std::swap(Op1Reg, Op2Reg);
25872
25873     if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25874       Op1Reg = RegRewriteTable[Op1Reg].first;
25875
25876     if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25877       Op2Reg = RegRewriteTable[Op2Reg].second;
25878
25879     MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25880                   TII->get(X86::PHI), DestReg)
25881           .addReg(Op1Reg).addMBB(copy0MBB)
25882           .addReg(Op2Reg).addMBB(thisMBB);
25883
25884     // Add this PHI to the rewrite table.
25885     RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25886   }
25887
25888   // If we have a cascaded CMOV, the second Jcc provides the same incoming
25889   // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25890   if (CascadedCMOV) {
25891     MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25892     // Copy the PHI result to the register defined by the second CMOV.
25893     BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25894             DL, TII->get(TargetOpcode::COPY),
25895             CascadedCMOV->getOperand(0).getReg())
25896         .addReg(MI.getOperand(0).getReg());
25897     CascadedCMOV->eraseFromParent();
25898   }
25899
25900   // Now remove the CMOV(s).
25901   for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25902     (MIIt++)->eraseFromParent();
25903
25904   return sinkMBB;
25905 }
25906
25907 MachineBasicBlock *
25908 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25909                                        MachineBasicBlock *BB) const {
25910   // Combine the following atomic floating-point modification pattern:
25911   //   a.store(reg OP a.load(acquire), release)
25912   // Transform them into:
25913   //   OPss (%gpr), %xmm
25914   //   movss %xmm, (%gpr)
25915   // Or sd equivalent for 64-bit operations.
25916   unsigned MOp, FOp;
25917   switch (MI.getOpcode()) {
25918   default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25919   case X86::RELEASE_FADD32mr:
25920     FOp = X86::ADDSSrm;
25921     MOp = X86::MOVSSmr;
25922     break;
25923   case X86::RELEASE_FADD64mr:
25924     FOp = X86::ADDSDrm;
25925     MOp = X86::MOVSDmr;
25926     break;
25927   }
25928   const X86InstrInfo *TII = Subtarget.getInstrInfo();
25929   DebugLoc DL = MI.getDebugLoc();
25930   MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25931   unsigned ValOpIdx = X86::AddrNumOperands;
25932   unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25933   MachineInstrBuilder MIB =
25934       BuildMI(*BB, MI, DL, TII->get(FOp),
25935               MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25936           .addReg(VSrc);
25937   for (int i = 0; i < X86::AddrNumOperands; ++i) {
25938     MachineOperand &Operand = MI.getOperand(i);
25939     // Clear any kill flags on register operands as we'll create a second
25940     // instruction using the same address operands.
25941     if (Operand.isReg())
25942       Operand.setIsKill(false);
25943     MIB.add(Operand);
25944   }
25945   MachineInstr *FOpMI = MIB;
25946   MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25947   for (int i = 0; i < X86::AddrNumOperands; ++i)
25948     MIB.add(MI.getOperand(i));
25949   MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25950   MI.eraseFromParent(); // The pseudo instruction is gone now.
25951   return BB;
25952 }
25953
25954 MachineBasicBlock *
25955 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25956                                         MachineBasicBlock *BB) const {
25957   MachineFunction *MF = BB->getParent();
25958   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25959   DebugLoc DL = MI.getDebugLoc();
25960   const BasicBlock *LLVM_BB = BB->getBasicBlock();
25961
25962   assert(MF->shouldSplitStack());
25963
25964   const bool Is64Bit = Subtarget.is64Bit();
25965   const bool IsLP64 = Subtarget.isTarget64BitLP64();
25966
25967   const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25968   const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25969
25970   // BB:
25971   //  ... [Till the alloca]
25972   // If stacklet is not large enough, jump to mallocMBB
25973   //
25974   // bumpMBB:
25975   //  Allocate by subtracting from RSP
25976   //  Jump to continueMBB
25977   //
25978   // mallocMBB:
25979   //  Allocate by call to runtime
25980   //
25981   // continueMBB:
25982   //  ...
25983   //  [rest of original BB]
25984   //
25985
25986   MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25987   MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25988   MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25989
25990   MachineRegisterInfo &MRI = MF->getRegInfo();
25991   const TargetRegisterClass *AddrRegClass =
25992       getRegClassFor(getPointerTy(MF->getDataLayout()));
25993
25994   unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25995            bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25996            tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25997            SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25998            sizeVReg = MI.getOperand(1).getReg(),
25999            physSPReg =
26000                IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
26001
26002   MachineFunction::iterator MBBIter = ++BB->getIterator();
26003
26004   MF->insert(MBBIter, bumpMBB);
26005   MF->insert(MBBIter, mallocMBB);
26006   MF->insert(MBBIter, continueMBB);
26007
26008   continueMBB->splice(continueMBB->begin(), BB,
26009                       std::next(MachineBasicBlock::iterator(MI)), BB->end());
26010   continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26011
26012   // Add code to the main basic block to check if the stack limit has been hit,
26013   // and if so, jump to mallocMBB otherwise to bumpMBB.
26014   BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26015   BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26016     .addReg(tmpSPVReg).addReg(sizeVReg);
26017   BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26018     .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26019     .addReg(SPLimitVReg);
26020   BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26021
26022   // bumpMBB simply decreases the stack pointer, since we know the current
26023   // stacklet has enough space.
26024   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26025     .addReg(SPLimitVReg);
26026   BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26027     .addReg(SPLimitVReg);
26028   BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26029
26030   // Calls into a routine in libgcc to allocate more space from the heap.
26031   const uint32_t *RegMask =
26032       Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26033   if (IsLP64) {
26034     BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26035       .addReg(sizeVReg);
26036     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26037       .addExternalSymbol("__morestack_allocate_stack_space")
26038       .addRegMask(RegMask)
26039       .addReg(X86::RDI, RegState::Implicit)
26040       .addReg(X86::RAX, RegState::ImplicitDefine);
26041   } else if (Is64Bit) {
26042     BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26043       .addReg(sizeVReg);
26044     BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26045       .addExternalSymbol("__morestack_allocate_stack_space")
26046       .addRegMask(RegMask)
26047       .addReg(X86::EDI, RegState::Implicit)
26048       .addReg(X86::EAX, RegState::ImplicitDefine);
26049   } else {
26050     BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26051       .addImm(12);
26052     BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26053     BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26054       .addExternalSymbol("__morestack_allocate_stack_space")
26055       .addRegMask(RegMask)
26056       .addReg(X86::EAX, RegState::ImplicitDefine);
26057   }
26058
26059   if (!Is64Bit)
26060     BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26061       .addImm(16);
26062
26063   BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26064     .addReg(IsLP64 ? X86::RAX : X86::EAX);
26065   BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26066
26067   // Set up the CFG correctly.
26068   BB->addSuccessor(bumpMBB);
26069   BB->addSuccessor(mallocMBB);
26070   mallocMBB->addSuccessor(continueMBB);
26071   bumpMBB->addSuccessor(continueMBB);
26072
26073   // Take care of the PHI nodes.
26074   BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26075           MI.getOperand(0).getReg())
26076       .addReg(mallocPtrVReg)
26077       .addMBB(mallocMBB)
26078       .addReg(bumpSPPtrVReg)
26079       .addMBB(bumpMBB);
26080
26081   // Delete the original pseudo instruction.
26082   MI.eraseFromParent();
26083
26084   // And we're done.
26085   return continueMBB;
26086 }
26087
26088 MachineBasicBlock *
26089 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26090                                        MachineBasicBlock *BB) const {
26091   MachineFunction *MF = BB->getParent();
26092   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26093   MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26094   DebugLoc DL = MI.getDebugLoc();
26095
26096   assert(!isAsynchronousEHPersonality(
26097              classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26098          "SEH does not use catchret!");
26099
26100   // Only 32-bit EH needs to worry about manually restoring stack pointers.
26101   if (!Subtarget.is32Bit())
26102     return BB;
26103
26104   // C++ EH creates a new target block to hold the restore code, and wires up
26105   // the new block to the return destination with a normal JMP_4.
26106   MachineBasicBlock *RestoreMBB =
26107       MF->CreateMachineBasicBlock(BB->getBasicBlock());
26108   assert(BB->succ_size() == 1);
26109   MF->insert(std::next(BB->getIterator()), RestoreMBB);
26110   RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26111   BB->addSuccessor(RestoreMBB);
26112   MI.getOperand(0).setMBB(RestoreMBB);
26113
26114   auto RestoreMBBI = RestoreMBB->begin();
26115   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26116   BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26117   return BB;
26118 }
26119
26120 MachineBasicBlock *
26121 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26122                                        MachineBasicBlock *BB) const {
26123   MachineFunction *MF = BB->getParent();
26124   const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26125   bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26126   // Only 32-bit SEH requires special handling for catchpad.
26127   if (IsSEH && Subtarget.is32Bit()) {
26128     const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26129     DebugLoc DL = MI.getDebugLoc();
26130     BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26131   }
26132   MI.eraseFromParent();
26133   return BB;
26134 }
26135
26136 MachineBasicBlock *
26137 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26138                                       MachineBasicBlock *BB) const {
26139   // So, here we replace TLSADDR with the sequence:
26140   // adjust_stackdown -> TLSADDR -> adjust_stackup.
26141   // We need this because TLSADDR is lowered into calls
26142   // inside MC, therefore without the two markers shrink-wrapping
26143   // may push the prologue/epilogue pass them.
26144   const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26145   DebugLoc DL = MI.getDebugLoc();
26146   MachineFunction &MF = *BB->getParent();
26147
26148   // Emit CALLSEQ_START right before the instruction.
26149   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26150   MachineInstrBuilder CallseqStart =
26151     BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26152   BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26153
26154   // Emit CALLSEQ_END right after the instruction.
26155   // We don't call erase from parent because we want to keep the
26156   // original instruction around.
26157   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26158   MachineInstrBuilder CallseqEnd =
26159     BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26160   BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26161
26162   return BB;
26163 }
26164
26165 MachineBasicBlock *
26166 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26167                                       MachineBasicBlock *BB) const {
26168   // This is pretty easy.  We're taking the value that we received from
26169   // our load from the relocation, sticking it in either RDI (x86-64)
26170   // or EAX and doing an indirect call.  The return value will then
26171   // be in the normal return register.
26172   MachineFunction *F = BB->getParent();
26173   const X86InstrInfo *TII = Subtarget.getInstrInfo();
26174   DebugLoc DL = MI.getDebugLoc();
26175
26176   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26177   assert(MI.getOperand(3).isGlobal() && "This should be a global");
26178
26179   // Get a register mask for the lowered call.
26180   // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26181   // proper register mask.
26182   const uint32_t *RegMask =
26183       Subtarget.is64Bit() ?
26184       Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26185       Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26186   if (Subtarget.is64Bit()) {
26187     MachineInstrBuilder MIB =
26188         BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26189             .addReg(X86::RIP)
26190             .addImm(0)
26191             .addReg(0)
26192             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26193                               MI.getOperand(3).getTargetFlags())
26194             .addReg(0);
26195     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26196     addDirectMem(MIB, X86::RDI);
26197     MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26198   } else if (!isPositionIndependent()) {
26199     MachineInstrBuilder MIB =
26200         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26201             .addReg(0)
26202             .addImm(0)
26203             .addReg(0)
26204             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26205                               MI.getOperand(3).getTargetFlags())
26206             .addReg(0);
26207     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26208     addDirectMem(MIB, X86::EAX);
26209     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26210   } else {
26211     MachineInstrBuilder MIB =
26212         BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26213             .addReg(TII->getGlobalBaseReg(F))
26214             .addImm(0)
26215             .addReg(0)
26216             .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26217                               MI.getOperand(3).getTargetFlags())
26218             .addReg(0);
26219     MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26220     addDirectMem(MIB, X86::EAX);
26221     MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26222   }
26223
26224   MI.eraseFromParent(); // The pseudo instruction is gone now.
26225   return BB;
26226 }
26227
26228 MachineBasicBlock *
26229 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26230                                     MachineBasicBlock *MBB) const {
26231   DebugLoc DL = MI.getDebugLoc();
26232   MachineFunction *MF = MBB->getParent();
26233   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26234   const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26235   MachineRegisterInfo &MRI = MF->getRegInfo();
26236
26237   const BasicBlock *BB = MBB->getBasicBlock();
26238   MachineFunction::iterator I = ++MBB->getIterator();
26239
26240   // Memory Reference
26241   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26242   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26243
26244   unsigned DstReg;
26245   unsigned MemOpndSlot = 0;
26246
26247   unsigned CurOp = 0;
26248
26249   DstReg = MI.getOperand(CurOp++).getReg();
26250   const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26251   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26252   (void)TRI;
26253   unsigned mainDstReg = MRI.createVirtualRegister(RC);
26254   unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26255
26256   MemOpndSlot = CurOp;
26257
26258   MVT PVT = getPointerTy(MF->getDataLayout());
26259   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26260          "Invalid Pointer Size!");
26261
26262   // For v = setjmp(buf), we generate
26263   //
26264   // thisMBB:
26265   //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26266   //  SjLjSetup restoreMBB
26267   //
26268   // mainMBB:
26269   //  v_main = 0
26270   //
26271   // sinkMBB:
26272   //  v = phi(main, restore)
26273   //
26274   // restoreMBB:
26275   //  if base pointer being used, load it from frame
26276   //  v_restore = 1
26277
26278   MachineBasicBlock *thisMBB = MBB;
26279   MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26280   MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26281   MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26282   MF->insert(I, mainMBB);
26283   MF->insert(I, sinkMBB);
26284   MF->push_back(restoreMBB);
26285   restoreMBB->setHasAddressTaken();
26286
26287   MachineInstrBuilder MIB;
26288
26289   // Transfer the remainder of BB and its successor edges to sinkMBB.
26290   sinkMBB->splice(sinkMBB->begin(), MBB,
26291                   std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26292   sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26293
26294   // thisMBB:
26295   unsigned PtrStoreOpc = 0;
26296   unsigned LabelReg = 0;
26297   const int64_t LabelOffset = 1 * PVT.getStoreSize();
26298   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26299                      !isPositionIndependent();
26300
26301   // Prepare IP either in reg or imm.
26302   if (!UseImmLabel) {
26303     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26304     const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26305     LabelReg = MRI.createVirtualRegister(PtrRC);
26306     if (Subtarget.is64Bit()) {
26307       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26308               .addReg(X86::RIP)
26309               .addImm(0)
26310               .addReg(0)
26311               .addMBB(restoreMBB)
26312               .addReg(0);
26313     } else {
26314       const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26315       MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26316               .addReg(XII->getGlobalBaseReg(MF))
26317               .addImm(0)
26318               .addReg(0)
26319               .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26320               .addReg(0);
26321     }
26322   } else
26323     PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26324   // Store IP
26325   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26326   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26327     if (i == X86::AddrDisp)
26328       MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26329     else
26330       MIB.add(MI.getOperand(MemOpndSlot + i));
26331   }
26332   if (!UseImmLabel)
26333     MIB.addReg(LabelReg);
26334   else
26335     MIB.addMBB(restoreMBB);
26336   MIB.setMemRefs(MMOBegin, MMOEnd);
26337   // Setup
26338   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26339           .addMBB(restoreMBB);
26340
26341   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26342   MIB.addRegMask(RegInfo->getNoPreservedMask());
26343   thisMBB->addSuccessor(mainMBB);
26344   thisMBB->addSuccessor(restoreMBB);
26345
26346   // mainMBB:
26347   //  EAX = 0
26348   BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26349   mainMBB->addSuccessor(sinkMBB);
26350
26351   // sinkMBB:
26352   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26353           TII->get(X86::PHI), DstReg)
26354     .addReg(mainDstReg).addMBB(mainMBB)
26355     .addReg(restoreDstReg).addMBB(restoreMBB);
26356
26357   // restoreMBB:
26358   if (RegInfo->hasBasePointer(*MF)) {
26359     const bool Uses64BitFramePtr =
26360         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26361     X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26362     X86FI->setRestoreBasePointer(MF);
26363     unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26364     unsigned BasePtr = RegInfo->getBaseRegister();
26365     unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26366     addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26367                  FramePtr, true, X86FI->getRestoreBasePointerOffset())
26368       .setMIFlag(MachineInstr::FrameSetup);
26369   }
26370   BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26371   BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26372   restoreMBB->addSuccessor(sinkMBB);
26373
26374   MI.eraseFromParent();
26375   return sinkMBB;
26376 }
26377
26378 MachineBasicBlock *
26379 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26380                                      MachineBasicBlock *MBB) const {
26381   DebugLoc DL = MI.getDebugLoc();
26382   MachineFunction *MF = MBB->getParent();
26383   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26384   MachineRegisterInfo &MRI = MF->getRegInfo();
26385
26386   // Memory Reference
26387   MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26388   MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26389
26390   MVT PVT = getPointerTy(MF->getDataLayout());
26391   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26392          "Invalid Pointer Size!");
26393
26394   const TargetRegisterClass *RC =
26395     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26396   unsigned Tmp = MRI.createVirtualRegister(RC);
26397   // Since FP is only updated here but NOT referenced, it's treated as GPR.
26398   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26399   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26400   unsigned SP = RegInfo->getStackRegister();
26401
26402   MachineInstrBuilder MIB;
26403
26404   const int64_t LabelOffset = 1 * PVT.getStoreSize();
26405   const int64_t SPOffset = 2 * PVT.getStoreSize();
26406
26407   unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26408   unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26409
26410   // Reload FP
26411   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26412   for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26413     MIB.add(MI.getOperand(i));
26414   MIB.setMemRefs(MMOBegin, MMOEnd);
26415   // Reload IP
26416   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26417   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26418     if (i == X86::AddrDisp)
26419       MIB.addDisp(MI.getOperand(i), LabelOffset);
26420     else
26421       MIB.add(MI.getOperand(i));
26422   }
26423   MIB.setMemRefs(MMOBegin, MMOEnd);
26424   // Reload SP
26425   MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26426   for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26427     if (i == X86::AddrDisp)
26428       MIB.addDisp(MI.getOperand(i), SPOffset);
26429     else
26430       MIB.add(MI.getOperand(i));
26431   }
26432   MIB.setMemRefs(MMOBegin, MMOEnd);
26433   // Jump
26434   BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26435
26436   MI.eraseFromParent();
26437   return MBB;
26438 }
26439
26440 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26441                                                MachineBasicBlock *MBB,
26442                                                MachineBasicBlock *DispatchBB,
26443                                                int FI) const {
26444   DebugLoc DL = MI.getDebugLoc();
26445   MachineFunction *MF = MBB->getParent();
26446   MachineRegisterInfo *MRI = &MF->getRegInfo();
26447   const X86InstrInfo *TII = Subtarget.getInstrInfo();
26448
26449   MVT PVT = getPointerTy(MF->getDataLayout());
26450   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26451
26452   unsigned Op = 0;
26453   unsigned VR = 0;
26454
26455   bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26456                      !isPositionIndependent();
26457
26458   if (UseImmLabel) {
26459     Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26460   } else {
26461     const TargetRegisterClass *TRC =
26462         (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26463     VR = MRI->createVirtualRegister(TRC);
26464     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26465
26466     if (Subtarget.is64Bit())
26467       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26468           .addReg(X86::RIP)
26469           .addImm(1)
26470           .addReg(0)
26471           .addMBB(DispatchBB)
26472           .addReg(0);
26473     else
26474       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26475           .addReg(0) /* TII->getGlobalBaseReg(MF) */
26476           .addImm(1)
26477           .addReg(0)
26478           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26479           .addReg(0);
26480   }
26481
26482   MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26483   addFrameReference(MIB, FI, 36);
26484   if (UseImmLabel)
26485     MIB.addMBB(DispatchBB);
26486   else
26487     MIB.addReg(VR);
26488 }
26489
26490 MachineBasicBlock *
26491 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26492                                          MachineBasicBlock *BB) const {
26493   DebugLoc DL = MI.getDebugLoc();
26494   MachineFunction *MF = BB->getParent();
26495   MachineFrameInfo &MFI = MF->getFrameInfo();
26496   MachineRegisterInfo *MRI = &MF->getRegInfo();
26497   const X86InstrInfo *TII = Subtarget.getInstrInfo();
26498   int FI = MFI.getFunctionContextIndex();
26499
26500   // Get a mapping of the call site numbers to all of the landing pads they're
26501   // associated with.
26502   DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26503   unsigned MaxCSNum = 0;
26504   for (auto &MBB : *MF) {
26505     if (!MBB.isEHPad())
26506       continue;
26507
26508     MCSymbol *Sym = nullptr;
26509     for (const auto &MI : MBB) {
26510       if (MI.isDebugValue())
26511         continue;
26512
26513       assert(MI.isEHLabel() && "expected EH_LABEL");
26514       Sym = MI.getOperand(0).getMCSymbol();
26515       break;
26516     }
26517
26518     if (!MF->hasCallSiteLandingPad(Sym))
26519       continue;
26520
26521     for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26522       CallSiteNumToLPad[CSI].push_back(&MBB);
26523       MaxCSNum = std::max(MaxCSNum, CSI);
26524     }
26525   }
26526
26527   // Get an ordered list of the machine basic blocks for the jump table.
26528   std::vector<MachineBasicBlock *> LPadList;
26529   SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26530   LPadList.reserve(CallSiteNumToLPad.size());
26531
26532   for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26533     for (auto &LP : CallSiteNumToLPad[CSI]) {
26534       LPadList.push_back(LP);
26535       InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26536     }
26537   }
26538
26539   assert(!LPadList.empty() &&
26540          "No landing pad destinations for the dispatch jump table!");
26541
26542   // Create the MBBs for the dispatch code.
26543
26544   // Shove the dispatch's address into the return slot in the function context.
26545   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26546   DispatchBB->setIsEHPad(true);
26547
26548   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26549   BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26550   DispatchBB->addSuccessor(TrapBB);
26551
26552   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26553   DispatchBB->addSuccessor(DispContBB);
26554
26555   // Insert MBBs.
26556   MF->push_back(DispatchBB);
26557   MF->push_back(DispContBB);
26558   MF->push_back(TrapBB);
26559
26560   // Insert code into the entry block that creates and registers the function
26561   // context.
26562   SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26563
26564   // Create the jump table and associated information
26565   MachineJumpTableInfo *JTI =
26566       MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26567   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26568
26569   const X86RegisterInfo &RI = TII->getRegisterInfo();
26570   // Add a register mask with no preserved registers.  This results in all
26571   // registers being marked as clobbered.
26572   if (RI.hasBasePointer(*MF)) {
26573     const bool FPIs64Bit =
26574         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26575     X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26576     MFI->setRestoreBasePointer(MF);
26577
26578     unsigned FP = RI.getFrameRegister(*MF);
26579     unsigned BP = RI.getBaseRegister();
26580     unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26581     addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26582                  MFI->getRestoreBasePointerOffset())
26583         .addRegMask(RI.getNoPreservedMask());
26584   } else {
26585     BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26586         .addRegMask(RI.getNoPreservedMask());
26587   }
26588
26589   unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26590   addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26591                     4);
26592   BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26593       .addReg(IReg)
26594       .addImm(LPadList.size());
26595   BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
26596
26597   unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26598   BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26599       .addReg(IReg)
26600       .addImm(1);
26601   BuildMI(DispContBB, DL,
26602           TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26603       .addReg(0)
26604       .addImm(Subtarget.is64Bit() ? 8 : 4)
26605       .addReg(JReg)
26606       .addJumpTableIndex(MJTI)
26607       .addReg(0);
26608
26609   // Add the jump table entries as successors to the MBB.
26610   SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26611   for (auto &LP : LPadList)
26612     if (SeenMBBs.insert(LP).second)
26613       DispContBB->addSuccessor(LP);
26614
26615   // N.B. the order the invoke BBs are processed in doesn't matter here.
26616   SmallVector<MachineBasicBlock *, 64> MBBLPads;
26617   const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26618   for (MachineBasicBlock *MBB : InvokeBBs) {
26619     // Remove the landing pad successor from the invoke block and replace it
26620     // with the new dispatch block.
26621     // Keep a copy of Successors since it's modified inside the loop.
26622     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26623                                                    MBB->succ_rend());
26624     // FIXME: Avoid quadratic complexity.
26625     for (auto MBBS : Successors) {
26626       if (MBBS->isEHPad()) {
26627         MBB->removeSuccessor(MBBS);
26628         MBBLPads.push_back(MBBS);
26629       }
26630     }
26631
26632     MBB->addSuccessor(DispatchBB);
26633
26634     // Find the invoke call and mark all of the callee-saved registers as
26635     // 'implicit defined' so that they're spilled.  This prevents code from
26636     // moving instructions to before the EH block, where they will never be
26637     // executed.
26638     for (auto &II : reverse(*MBB)) {
26639       if (!II.isCall())
26640         continue;
26641
26642       DenseMap<unsigned, bool> DefRegs;
26643       for (auto &MOp : II.operands())
26644         if (MOp.isReg())
26645           DefRegs[MOp.getReg()] = true;
26646
26647       MachineInstrBuilder MIB(*MF, &II);
26648       for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26649         unsigned Reg = SavedRegs[RI];
26650         if (!DefRegs[Reg])
26651           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26652       }
26653
26654       break;
26655     }
26656   }
26657
26658   // Mark all former landing pads as non-landing pads.  The dispatch is the only
26659   // landing pad now.
26660   for (auto &LP : MBBLPads)
26661     LP->setIsEHPad(false);
26662
26663   // The instruction is gone now.
26664   MI.eraseFromParent();
26665   return BB;
26666 }
26667
26668 MachineBasicBlock *
26669 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26670                                                MachineBasicBlock *BB) const {
26671   MachineFunction *MF = BB->getParent();
26672   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26673   DebugLoc DL = MI.getDebugLoc();
26674
26675   switch (MI.getOpcode()) {
26676   default: llvm_unreachable("Unexpected instr type to insert");
26677   case X86::TAILJMPd64:
26678   case X86::TAILJMPr64:
26679   case X86::TAILJMPm64:
26680   case X86::TAILJMPr64_REX:
26681   case X86::TAILJMPm64_REX:
26682     llvm_unreachable("TAILJMP64 would not be touched here.");
26683   case X86::TCRETURNdi64:
26684   case X86::TCRETURNri64:
26685   case X86::TCRETURNmi64:
26686     return BB;
26687   case X86::TLS_addr32:
26688   case X86::TLS_addr64:
26689   case X86::TLS_base_addr32:
26690   case X86::TLS_base_addr64:
26691     return EmitLoweredTLSAddr(MI, BB);
26692   case X86::CATCHRET:
26693     return EmitLoweredCatchRet(MI, BB);
26694   case X86::CATCHPAD:
26695     return EmitLoweredCatchPad(MI, BB);
26696   case X86::SEG_ALLOCA_32:
26697   case X86::SEG_ALLOCA_64:
26698     return EmitLoweredSegAlloca(MI, BB);
26699   case X86::TLSCall_32:
26700   case X86::TLSCall_64:
26701     return EmitLoweredTLSCall(MI, BB);
26702   case X86::CMOV_FR32:
26703   case X86::CMOV_FR64:
26704   case X86::CMOV_FR128:
26705   case X86::CMOV_GR8:
26706   case X86::CMOV_GR16:
26707   case X86::CMOV_GR32:
26708   case X86::CMOV_RFP32:
26709   case X86::CMOV_RFP64:
26710   case X86::CMOV_RFP80:
26711   case X86::CMOV_V2F64:
26712   case X86::CMOV_V2I64:
26713   case X86::CMOV_V4F32:
26714   case X86::CMOV_V4F64:
26715   case X86::CMOV_V4I64:
26716   case X86::CMOV_V16F32:
26717   case X86::CMOV_V8F32:
26718   case X86::CMOV_V8F64:
26719   case X86::CMOV_V8I64:
26720   case X86::CMOV_V8I1:
26721   case X86::CMOV_V16I1:
26722   case X86::CMOV_V32I1:
26723   case X86::CMOV_V64I1:
26724     return EmitLoweredSelect(MI, BB);
26725
26726   case X86::RDFLAGS32:
26727   case X86::RDFLAGS64: {
26728     unsigned PushF =
26729         MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26730     unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26731     MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26732     // Permit reads of the FLAGS register without it being defined.
26733     // This intrinsic exists to read external processor state in flags, such as
26734     // the trap flag, interrupt flag, and direction flag, none of which are
26735     // modeled by the backend.
26736     Push->getOperand(2).setIsUndef();
26737     BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26738
26739     MI.eraseFromParent(); // The pseudo is gone now.
26740     return BB;
26741   }
26742
26743   case X86::WRFLAGS32:
26744   case X86::WRFLAGS64: {
26745     unsigned Push =
26746         MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26747     unsigned PopF =
26748         MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26749     BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26750     BuildMI(*BB, MI, DL, TII->get(PopF));
26751
26752     MI.eraseFromParent(); // The pseudo is gone now.
26753     return BB;
26754   }
26755
26756   case X86::RELEASE_FADD32mr:
26757   case X86::RELEASE_FADD64mr:
26758     return EmitLoweredAtomicFP(MI, BB);
26759
26760   case X86::FP32_TO_INT16_IN_MEM:
26761   case X86::FP32_TO_INT32_IN_MEM:
26762   case X86::FP32_TO_INT64_IN_MEM:
26763   case X86::FP64_TO_INT16_IN_MEM:
26764   case X86::FP64_TO_INT32_IN_MEM:
26765   case X86::FP64_TO_INT64_IN_MEM:
26766   case X86::FP80_TO_INT16_IN_MEM:
26767   case X86::FP80_TO_INT32_IN_MEM:
26768   case X86::FP80_TO_INT64_IN_MEM: {
26769     // Change the floating point control register to use "round towards zero"
26770     // mode when truncating to an integer value.
26771     int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26772     addFrameReference(BuildMI(*BB, MI, DL,
26773                               TII->get(X86::FNSTCW16m)), CWFrameIdx);
26774
26775     // Load the old value of the high byte of the control word...
26776     unsigned OldCW =
26777       MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26778     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26779                       CWFrameIdx);
26780
26781     // Set the high part to be round to zero...
26782     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26783       .addImm(0xC7F);
26784
26785     // Reload the modified control word now...
26786     addFrameReference(BuildMI(*BB, MI, DL,
26787                               TII->get(X86::FLDCW16m)), CWFrameIdx);
26788
26789     // Restore the memory image of control word to original value
26790     addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26791       .addReg(OldCW);
26792
26793     // Get the X86 opcode to use.
26794     unsigned Opc;
26795     switch (MI.getOpcode()) {
26796     default: llvm_unreachable("illegal opcode!");
26797     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26798     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26799     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26800     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26801     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26802     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26803     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26804     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26805     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26806     }
26807
26808     X86AddressMode AM = getAddressFromInstr(&MI, 0);
26809     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26810         .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26811
26812     // Reload the original control word now.
26813     addFrameReference(BuildMI(*BB, MI, DL,
26814                               TII->get(X86::FLDCW16m)), CWFrameIdx);
26815
26816     MI.eraseFromParent(); // The pseudo instruction is gone now.
26817     return BB;
26818   }
26819     // String/text processing lowering.
26820   case X86::PCMPISTRM128REG:
26821   case X86::VPCMPISTRM128REG:
26822   case X86::PCMPISTRM128MEM:
26823   case X86::VPCMPISTRM128MEM:
26824   case X86::PCMPESTRM128REG:
26825   case X86::VPCMPESTRM128REG:
26826   case X86::PCMPESTRM128MEM:
26827   case X86::VPCMPESTRM128MEM:
26828     assert(Subtarget.hasSSE42() &&
26829            "Target must have SSE4.2 or AVX features enabled");
26830     return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26831
26832   // String/text processing lowering.
26833   case X86::PCMPISTRIREG:
26834   case X86::VPCMPISTRIREG:
26835   case X86::PCMPISTRIMEM:
26836   case X86::VPCMPISTRIMEM:
26837   case X86::PCMPESTRIREG:
26838   case X86::VPCMPESTRIREG:
26839   case X86::PCMPESTRIMEM:
26840   case X86::VPCMPESTRIMEM:
26841     assert(Subtarget.hasSSE42() &&
26842            "Target must have SSE4.2 or AVX features enabled");
26843     return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26844
26845   // Thread synchronization.
26846   case X86::MONITOR:
26847     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26848   case X86::MONITORX:
26849     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26850
26851   // Cache line zero
26852   case X86::CLZERO:
26853     return emitClzero(&MI, BB, Subtarget);
26854
26855   // PKU feature
26856   case X86::WRPKRU:
26857     return emitWRPKRU(MI, BB, Subtarget);
26858   case X86::RDPKRU:
26859     return emitRDPKRU(MI, BB, Subtarget);
26860   // xbegin
26861   case X86::XBEGIN:
26862     return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26863
26864   case X86::VASTART_SAVE_XMM_REGS:
26865     return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26866
26867   case X86::VAARG_64:
26868     return EmitVAARG64WithCustomInserter(MI, BB);
26869
26870   case X86::EH_SjLj_SetJmp32:
26871   case X86::EH_SjLj_SetJmp64:
26872     return emitEHSjLjSetJmp(MI, BB);
26873
26874   case X86::EH_SjLj_LongJmp32:
26875   case X86::EH_SjLj_LongJmp64:
26876     return emitEHSjLjLongJmp(MI, BB);
26877
26878   case X86::Int_eh_sjlj_setup_dispatch:
26879     return EmitSjLjDispatchBlock(MI, BB);
26880
26881   case TargetOpcode::STATEPOINT:
26882     // As an implementation detail, STATEPOINT shares the STACKMAP format at
26883     // this point in the process.  We diverge later.
26884     return emitPatchPoint(MI, BB);
26885
26886   case TargetOpcode::STACKMAP:
26887   case TargetOpcode::PATCHPOINT:
26888     return emitPatchPoint(MI, BB);
26889
26890   case TargetOpcode::PATCHABLE_EVENT_CALL:
26891     // Do nothing here, handle in xray instrumentation pass.
26892     return BB;
26893
26894   case X86::LCMPXCHG8B: {
26895     const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26896     // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
26897     // requires a memory operand. If it happens that current architecture is
26898     // i686 and for current function we need a base pointer
26899     // - which is ESI for i686 - register allocator would not be able to
26900     // allocate registers for an address in form of X(%reg, %reg, Y)
26901     // - there never would be enough unreserved registers during regalloc
26902     // (without the need for base ptr the only option would be X(%edi, %esi, Y).
26903     // We are giving a hand to register allocator by precomputing the address in
26904     // a new vreg using LEA.
26905
26906     // If it is not i686 or there is no base pointer - nothing to do here.
26907     if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26908       return BB;
26909
26910     // Even though this code does not necessarily needs the base pointer to
26911     // be ESI, we check for that. The reason: if this assert fails, there are
26912     // some changes happened in the compiler base pointer handling, which most
26913     // probably have to be addressed somehow here.
26914     assert(TRI->getBaseRegister() == X86::ESI &&
26915            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26916            "base pointer in mind");
26917
26918     MachineRegisterInfo &MRI = MF->getRegInfo();
26919     MVT SPTy = getPointerTy(MF->getDataLayout());
26920     const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26921     unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26922
26923     X86AddressMode AM = getAddressFromInstr(&MI, 0);
26924     // Regalloc does not need any help when the memory operand of CMPXCHG8B
26925     // does not use index register.
26926     if (AM.IndexReg == X86::NoRegister)
26927       return BB;
26928
26929     // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26930     // four operand definitions that are E[ABCD] registers. We skip them and
26931     // then insert the LEA.
26932     MachineBasicBlock::iterator MBBI(MI);
26933     while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26934            MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26935       --MBBI;
26936     addFullAddress(
26937         BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26938
26939     setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26940
26941     return BB;
26942   }
26943   case X86::LCMPXCHG16B:
26944     return BB;
26945   case X86::LCMPXCHG8B_SAVE_EBX:
26946   case X86::LCMPXCHG16B_SAVE_RBX: {
26947     unsigned BasePtr =
26948         MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26949     if (!BB->isLiveIn(BasePtr))
26950       BB->addLiveIn(BasePtr);
26951     return BB;
26952   }
26953   }
26954 }
26955
26956 //===----------------------------------------------------------------------===//
26957 //                           X86 Optimization Hooks
26958 //===----------------------------------------------------------------------===//
26959
26960 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26961                                                       KnownBits &Known,
26962                                                       const APInt &DemandedElts,
26963                                                       const SelectionDAG &DAG,
26964                                                       unsigned Depth) const {
26965   unsigned BitWidth = Known.getBitWidth();
26966   unsigned Opc = Op.getOpcode();
26967   EVT VT = Op.getValueType();
26968   assert((Opc >= ISD::BUILTIN_OP_END ||
26969           Opc == ISD::INTRINSIC_WO_CHAIN ||
26970           Opc == ISD::INTRINSIC_W_CHAIN ||
26971           Opc == ISD::INTRINSIC_VOID) &&
26972          "Should use MaskedValueIsZero if you don't know whether Op"
26973          " is a target node!");
26974
26975   Known.resetAll();
26976   switch (Opc) {
26977   default: break;
26978   case X86ISD::ADD:
26979   case X86ISD::SUB:
26980   case X86ISD::ADC:
26981   case X86ISD::SBB:
26982   case X86ISD::SMUL:
26983   case X86ISD::UMUL:
26984   case X86ISD::INC:
26985   case X86ISD::DEC:
26986   case X86ISD::OR:
26987   case X86ISD::XOR:
26988   case X86ISD::AND:
26989     // These nodes' second result is a boolean.
26990     if (Op.getResNo() == 0)
26991       break;
26992     LLVM_FALLTHROUGH;
26993   case X86ISD::SETCC:
26994     Known.Zero.setBitsFrom(1);
26995     break;
26996   case X86ISD::MOVMSK: {
26997     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26998     Known.Zero.setBitsFrom(NumLoBits);
26999     break;
27000   }
27001   case X86ISD::VSHLI:
27002   case X86ISD::VSRLI: {
27003     if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
27004       if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
27005         Known.setAllZero();
27006         break;
27007       }
27008
27009       DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
27010       unsigned ShAmt = ShiftImm->getZExtValue();
27011       if (Opc == X86ISD::VSHLI) {
27012         Known.Zero <<= ShAmt;
27013         Known.One <<= ShAmt;
27014         // Low bits are known zero.
27015         Known.Zero.setLowBits(ShAmt);
27016       } else {
27017         Known.Zero.lshrInPlace(ShAmt);
27018         Known.One.lshrInPlace(ShAmt);
27019         // High bits are known zero.
27020         Known.Zero.setHighBits(ShAmt);
27021       }
27022     }
27023     break;
27024   }
27025   case X86ISD::VZEXT: {
27026     SDValue N0 = Op.getOperand(0);
27027     unsigned NumElts = VT.getVectorNumElements();
27028
27029     EVT SrcVT = N0.getValueType();
27030     unsigned InNumElts = SrcVT.getVectorNumElements();
27031     unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27032     assert(InNumElts >= NumElts && "Illegal VZEXT input");
27033
27034     Known = KnownBits(InBitWidth);
27035     APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27036     DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27037     Known = Known.zext(BitWidth);
27038     Known.Zero.setBitsFrom(InBitWidth);
27039     break;
27040   }
27041   }
27042 }
27043
27044 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27045     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27046     unsigned Depth) const {
27047   unsigned VTBits = Op.getScalarValueSizeInBits();
27048   unsigned Opcode = Op.getOpcode();
27049   switch (Opcode) {
27050   case X86ISD::SETCC_CARRY:
27051     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27052     return VTBits;
27053
27054   case X86ISD::VSEXT: {
27055     SDValue Src = Op.getOperand(0);
27056     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27057     Tmp += VTBits - Src.getScalarValueSizeInBits();
27058     return Tmp;
27059   }
27060
27061   case X86ISD::VSHLI: {
27062     SDValue Src = Op.getOperand(0);
27063     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27064     APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27065     if (ShiftVal.uge(VTBits))
27066       return VTBits; // Shifted all bits out --> zero.
27067     if (ShiftVal.uge(Tmp))
27068       return 1; // Shifted all sign bits out --> unknown.
27069     return Tmp - ShiftVal.getZExtValue();
27070   }
27071
27072   case X86ISD::VSRAI: {
27073     SDValue Src = Op.getOperand(0);
27074     unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27075     APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27076     ShiftVal += Tmp;
27077     return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27078   }
27079
27080   case X86ISD::PCMPGT:
27081   case X86ISD::PCMPEQ:
27082   case X86ISD::CMPP:
27083   case X86ISD::VPCOM:
27084   case X86ISD::VPCOMU:
27085     // Vector compares return zero/all-bits result values.
27086     return VTBits;
27087   }
27088
27089   // Fallback case.
27090   return 1;
27091 }
27092
27093 /// Returns true (and the GlobalValue and the offset) if the node is a
27094 /// GlobalAddress + offset.
27095 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27096                                        const GlobalValue* &GA,
27097                                        int64_t &Offset) const {
27098   if (N->getOpcode() == X86ISD::Wrapper) {
27099     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27100       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27101       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27102       return true;
27103     }
27104   }
27105   return TargetLowering::isGAPlusOffset(N, GA, Offset);
27106 }
27107
27108 // Attempt to match a combined shuffle mask against supported unary shuffle
27109 // instructions.
27110 // TODO: Investigate sharing more of this with shuffle lowering.
27111 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27112                                     bool AllowFloatDomain, bool AllowIntDomain,
27113                                     SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27114                                     const X86Subtarget &Subtarget,
27115                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27116   unsigned NumMaskElts = Mask.size();
27117   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27118
27119   // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27120   // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27121   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27122                          (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27123     unsigned MaxScale = 64 / MaskEltSize;
27124     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27125       bool Match = true;
27126       unsigned NumDstElts = NumMaskElts / Scale;
27127       for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27128         Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27129         Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27130       }
27131       if (Match) {
27132         unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27133         SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
27134         if (SrcVT != MaskVT)
27135           V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27136         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27137         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27138         Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
27139                                   : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27140         return true;
27141       }
27142     }
27143   }
27144
27145   // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27146   if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27147       isUndefOrEqual(Mask[0], 0) &&
27148       isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27149     Shuffle = X86ISD::VZEXT_MOVL;
27150     SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27151     return true;
27152   }
27153
27154   // Check if we have SSE3 which will let us use MOVDDUP etc. The
27155   // instructions are no slower than UNPCKLPD but has the option to
27156   // fold the input operand into even an unaligned memory load.
27157   if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27158     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
27159       Shuffle = X86ISD::MOVDDUP;
27160       SrcVT = DstVT = MVT::v2f64;
27161       return true;
27162     }
27163     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27164       Shuffle = X86ISD::MOVSLDUP;
27165       SrcVT = DstVT = MVT::v4f32;
27166       return true;
27167     }
27168     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27169       Shuffle = X86ISD::MOVSHDUP;
27170       SrcVT = DstVT = MVT::v4f32;
27171       return true;
27172     }
27173   }
27174
27175   if (MaskVT.is256BitVector() && AllowFloatDomain) {
27176     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27177     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27178       Shuffle = X86ISD::MOVDDUP;
27179       SrcVT = DstVT = MVT::v4f64;
27180       return true;
27181     }
27182     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27183       Shuffle = X86ISD::MOVSLDUP;
27184       SrcVT = DstVT = MVT::v8f32;
27185       return true;
27186     }
27187     if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27188       Shuffle = X86ISD::MOVSHDUP;
27189       SrcVT = DstVT = MVT::v8f32;
27190       return true;
27191     }
27192   }
27193
27194   if (MaskVT.is512BitVector() && AllowFloatDomain) {
27195     assert(Subtarget.hasAVX512() &&
27196            "AVX512 required for 512-bit vector shuffles");
27197     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27198       Shuffle = X86ISD::MOVDDUP;
27199       SrcVT = DstVT = MVT::v8f64;
27200       return true;
27201     }
27202     if (isTargetShuffleEquivalent(
27203             Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27204       Shuffle = X86ISD::MOVSLDUP;
27205       SrcVT = DstVT = MVT::v16f32;
27206       return true;
27207     }
27208     if (isTargetShuffleEquivalent(
27209             Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27210       Shuffle = X86ISD::MOVSHDUP;
27211       SrcVT = DstVT = MVT::v16f32;
27212       return true;
27213     }
27214   }
27215
27216   // Attempt to match against broadcast-from-vector.
27217   if (Subtarget.hasAVX2()) {
27218     SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27219     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27220       SrcVT = DstVT = MaskVT;
27221       Shuffle = X86ISD::VBROADCAST;
27222       return true;
27223     }
27224   }
27225
27226   return false;
27227 }
27228
27229 // Attempt to match a combined shuffle mask against supported unary immediate
27230 // permute instructions.
27231 // TODO: Investigate sharing more of this with shuffle lowering.
27232 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27233                                            const APInt &Zeroable,
27234                                            bool AllowFloatDomain,
27235                                            bool AllowIntDomain,
27236                                            const X86Subtarget &Subtarget,
27237                                            unsigned &Shuffle, MVT &ShuffleVT,
27238                                            unsigned &PermuteImm) {
27239   unsigned NumMaskElts = Mask.size();
27240   unsigned InputSizeInBits = MaskVT.getSizeInBits();
27241   unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27242   MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27243
27244   bool ContainsZeros =
27245       llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27246
27247   // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
27248   if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27249     // Check for lane crossing permutes.
27250     if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27251       // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27252       if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27253         Shuffle = X86ISD::VPERMI;
27254         ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27255         PermuteImm = getV4X86ShuffleImm(Mask);
27256         return true;
27257       }
27258       if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27259         SmallVector<int, 4> RepeatedMask;
27260         if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27261           Shuffle = X86ISD::VPERMI;
27262           ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27263           PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27264           return true;
27265         }
27266       }
27267     } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27268       // VPERMILPD can permute with a non-repeating shuffle.
27269       Shuffle = X86ISD::VPERMILPI;
27270       ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27271       PermuteImm = 0;
27272       for (int i = 0, e = Mask.size(); i != e; ++i) {
27273         int M = Mask[i];
27274         if (M == SM_SentinelUndef)
27275           continue;
27276         assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27277         PermuteImm |= (M & 1) << i;
27278       }
27279       return true;
27280     }
27281   }
27282
27283   // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27284   // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
27285   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27286   if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27287       !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27288     SmallVector<int, 4> RepeatedMask;
27289     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27290       // Narrow the repeated mask to create 32-bit element permutes.
27291       SmallVector<int, 4> WordMask = RepeatedMask;
27292       if (MaskScalarSizeInBits == 64)
27293         scaleShuffleMask(2, RepeatedMask, WordMask);
27294
27295       Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27296       ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27297       ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27298       PermuteImm = getV4X86ShuffleImm(WordMask);
27299       return true;
27300     }
27301   }
27302
27303   // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27304   if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27305     SmallVector<int, 4> RepeatedMask;
27306     if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27307       ArrayRef<int> LoMask(Mask.data() + 0, 4);
27308       ArrayRef<int> HiMask(Mask.data() + 4, 4);
27309
27310       // PSHUFLW: permute lower 4 elements only.
27311       if (isUndefOrInRange(LoMask, 0, 4) &&
27312           isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27313         Shuffle = X86ISD::PSHUFLW;
27314         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27315         PermuteImm = getV4X86ShuffleImm(LoMask);
27316         return true;
27317       }
27318
27319       // PSHUFHW: permute upper 4 elements only.
27320       if (isUndefOrInRange(HiMask, 4, 8) &&
27321           isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27322         // Offset the HiMask so that we can create the shuffle immediate.
27323         int OffsetHiMask[4];
27324         for (int i = 0; i != 4; ++i)
27325           OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27326
27327         Shuffle = X86ISD::PSHUFHW;
27328         ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27329         PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27330         return true;
27331       }
27332     }
27333   }
27334
27335   // Attempt to match against byte/bit shifts.
27336   // FIXME: Add 512-bit support.
27337   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27338                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27339     int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
27340                                              MaskScalarSizeInBits, Mask,
27341                                              0, Zeroable, Subtarget);
27342     if (0 < ShiftAmt) {
27343       PermuteImm = (unsigned)ShiftAmt;
27344       return true;
27345     }
27346   }
27347
27348   return false;
27349 }
27350
27351 // Attempt to match a combined unary shuffle mask against supported binary
27352 // shuffle instructions.
27353 // TODO: Investigate sharing more of this with shuffle lowering.
27354 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27355                                      bool AllowFloatDomain, bool AllowIntDomain,
27356                                      SDValue &V1, SDValue &V2, SDLoc &DL,
27357                                      SelectionDAG &DAG,
27358                                      const X86Subtarget &Subtarget,
27359                                      unsigned &Shuffle, MVT &ShuffleVT,
27360                                      bool IsUnary) {
27361   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27362
27363   if (MaskVT.is128BitVector()) {
27364     if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27365       V2 = V1;
27366       Shuffle = X86ISD::MOVLHPS;
27367       ShuffleVT = MVT::v4f32;
27368       return true;
27369     }
27370     if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27371       V2 = V1;
27372       Shuffle = X86ISD::MOVHLPS;
27373       ShuffleVT = MVT::v4f32;
27374       return true;
27375     }
27376     if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27377         (AllowFloatDomain || !Subtarget.hasSSE41())) {
27378       std::swap(V1, V2);
27379       Shuffle = X86ISD::MOVSD;
27380       ShuffleVT = MaskVT;
27381       return true;
27382     }
27383     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27384         (AllowFloatDomain || !Subtarget.hasSSE41())) {
27385       Shuffle = X86ISD::MOVSS;
27386       ShuffleVT = MaskVT;
27387       return true;
27388     }
27389   }
27390
27391   // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27392   if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27393       (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27394       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27395       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27396       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27397     if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27398                                     DAG, Subtarget)) {
27399       ShuffleVT = MaskVT;
27400       if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27401         ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27402       return true;
27403     }
27404   }
27405
27406   return false;
27407 }
27408
27409 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27410                                             const APInt &Zeroable,
27411                                             bool AllowFloatDomain,
27412                                             bool AllowIntDomain,
27413                                             SDValue &V1, SDValue &V2, SDLoc &DL,
27414                                             SelectionDAG &DAG,
27415                                             const X86Subtarget &Subtarget,
27416                                             unsigned &Shuffle, MVT &ShuffleVT,
27417                                             unsigned &PermuteImm) {
27418   unsigned NumMaskElts = Mask.size();
27419   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27420
27421   // Attempt to match against PALIGNR byte rotate.
27422   if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27423                          (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27424     int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27425     if (0 < ByteRotation) {
27426       Shuffle = X86ISD::PALIGNR;
27427       ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27428       PermuteImm = ByteRotation;
27429       return true;
27430     }
27431   }
27432
27433   // Attempt to combine to X86ISD::BLENDI.
27434   if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27435                             (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27436       (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27437     uint64_t BlendMask = 0;
27438     bool ForceV1Zero = false, ForceV2Zero = false;
27439     SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27440     if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27441                                   BlendMask)) {
27442       if (MaskVT == MVT::v16i16) {
27443         // We can only use v16i16 PBLENDW if the lanes are repeated.
27444         SmallVector<int, 8> RepeatedMask;
27445         if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27446                                         RepeatedMask)) {
27447           assert(RepeatedMask.size() == 8 &&
27448                  "Repeated mask size doesn't match!");
27449           PermuteImm = 0;
27450           for (int i = 0; i < 8; ++i)
27451             if (RepeatedMask[i] >= 8)
27452               PermuteImm |= 1 << i;
27453           V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27454           V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27455           Shuffle = X86ISD::BLENDI;
27456           ShuffleVT = MaskVT;
27457           return true;
27458         }
27459       } else {
27460         // Determine a type compatible with X86ISD::BLENDI.
27461         ShuffleVT = MaskVT;
27462         if (Subtarget.hasAVX2()) {
27463           if (ShuffleVT == MVT::v4i64)
27464             ShuffleVT = MVT::v8i32;
27465           else if (ShuffleVT == MVT::v2i64)
27466             ShuffleVT = MVT::v4i32;
27467         } else {
27468           if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27469             ShuffleVT = MVT::v8i16;
27470           else if (ShuffleVT == MVT::v4i64)
27471             ShuffleVT = MVT::v4f64;
27472           else if (ShuffleVT == MVT::v8i32)
27473             ShuffleVT = MVT::v8f32;
27474         }
27475
27476         if (!ShuffleVT.isFloatingPoint()) {
27477           int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27478           BlendMask =
27479               scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27480           ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27481           ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27482         }
27483
27484         V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27485         V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27486         PermuteImm = (unsigned)BlendMask;
27487         Shuffle = X86ISD::BLENDI;
27488         return true;
27489       }
27490     }
27491   }
27492
27493   // Attempt to combine to INSERTPS.
27494   if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27495       MaskVT.is128BitVector()) {
27496     if (Zeroable.getBoolValue() &&
27497         matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27498       Shuffle = X86ISD::INSERTPS;
27499       ShuffleVT = MVT::v4f32;
27500       return true;
27501     }
27502   }
27503
27504   // Attempt to combine to SHUFPD.
27505   if (AllowFloatDomain && EltSizeInBits == 64 &&
27506       ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27507        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27508        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27509     if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27510       Shuffle = X86ISD::SHUFP;
27511       ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27512       return true;
27513     }
27514   }
27515
27516   // Attempt to combine to SHUFPS.
27517   if (AllowFloatDomain && EltSizeInBits == 32 &&
27518       ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27519        (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27520        (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27521     SmallVector<int, 4> RepeatedMask;
27522     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27523       // Match each half of the repeated mask, to determine if its just
27524       // referencing one of the vectors, is zeroable or entirely undef.
27525       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27526         int M0 = RepeatedMask[Offset];
27527         int M1 = RepeatedMask[Offset + 1];
27528
27529         if (isUndefInRange(RepeatedMask, Offset, 2)) {
27530           return DAG.getUNDEF(MaskVT);
27531         } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27532           S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27533           S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27534           return getZeroVector(MaskVT, Subtarget, DAG, DL);
27535         } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27536           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27537           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27538           return V1;
27539         } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27540           S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27541           S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27542           return V2;
27543         }
27544
27545         return SDValue();
27546       };
27547
27548       int ShufMask[4] = {-1, -1, -1, -1};
27549       SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27550       SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27551
27552       if (Lo && Hi) {
27553         V1 = Lo;
27554         V2 = Hi;
27555         Shuffle = X86ISD::SHUFP;
27556         ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27557         PermuteImm = getV4X86ShuffleImm(ShufMask);
27558         return true;
27559       }
27560     }
27561   }
27562
27563   return false;
27564 }
27565
27566 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27567 /// possible.
27568 ///
27569 /// This is the leaf of the recursive combine below. When we have found some
27570 /// chain of single-use x86 shuffle instructions and accumulated the combined
27571 /// shuffle mask represented by them, this will try to pattern match that mask
27572 /// into either a single instruction if there is a special purpose instruction
27573 /// for this operation, or into a PSHUFB instruction which is a fully general
27574 /// instruction but should only be used to replace chains over a certain depth.
      ///
      /// \param Inputs  The one or two source values entering the shuffle chain
      ///                (bitcasts are peeked through). Each must have the same
      ///                size in bits as \p Root.
      /// \param Root    The node being combined; on success it is replaced through
      ///                DCI.CombineTo with a bitcast of the new shuffle.
      /// \param BaseMask The accumulated shuffle mask (must be non-empty). Its
      ///                element size is the root size divided by the mask length;
      ///                negative entries are the undef/zero sentinels.
      /// \param Depth   Combine depth. At depth 1 a match whose opcode equals the
      ///                root opcode is rejected as a no-op, the variable-mask forms
      ///                are not attempted below depth 2, and domain crossing is
      ///                permitted above depth 3.
      /// \param HasVariableMask Relaxes the "Depth >= 3" requirement of the
      ///                variable-mask forms (presumably set when the chain already
      ///                contains a variable-mask shuffle - confirm with callers).
      /// \param DAG     The DAG in which replacement nodes are created.
      /// \param DCI     Combiner interface used to queue new nodes and replace Root.
      /// \param Subtarget Queried for the available instruction set extensions.
      /// \returns true if Root was replaced, false if no combine was performed.
27575 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27576                                    ArrayRef<int> BaseMask, int Depth,
27577                                    bool HasVariableMask, SelectionDAG &DAG,
27578                                    TargetLowering::DAGCombinerInfo &DCI,
27579                                    const X86Subtarget &Subtarget) {
27580   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27581   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27582          "Unexpected number of shuffle inputs!");
27583
27584   // Find the inputs that enter the chain. Note that multiple uses are OK
27585   // here, we're not going to remove the operands we find.
27586   bool UnaryShuffle = (Inputs.size() == 1);
27587   SDValue V1 = peekThroughBitcasts(Inputs[0]);
27588   SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27589                              : peekThroughBitcasts(Inputs[1]));
27590
27591   MVT VT1 = V1.getSimpleValueType();
27592   MVT VT2 = V2.getSimpleValueType();
27593   MVT RootVT = Root.getSimpleValueType();
27594   assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27595          VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27596          "Vector size mismatch");
27597
27598   SDLoc DL(Root);
27599   SDValue Res;
27600
        // A single-element mask can only select the whole of the first input, so
        // Root is simply a bitcast of V1.
27601   unsigned NumBaseMaskElts = BaseMask.size();
27602   if (NumBaseMaskElts == 1) {
27603     assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27604     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27605                   /*AddTo*/ true);
27606     return true;
27607   }
27608
27609   unsigned RootSizeInBits = RootVT.getSizeInBits();
27610   unsigned NumRootElts = RootVT.getVectorNumElements();
27611   unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
        // Use the floating point domain if either input is floating point, or if
        // the root is 256 bits wide and AVX2 is not available.
27612   bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27613                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27614
27615   // Don't combine if we are an AVX512/EVEX target and the mask element size
27616   // is different from the root element size - this would prevent writemasks
27617   // from being reused.
27618   // TODO - this currently prevents all lane shuffles from occurring.
27619   // TODO - check for writemasks usage instead of always preventing combining.
27620   // TODO - attempt to narrow Mask back to writemask size.
27621   bool IsEVEXShuffle =
27622       RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27623   if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27624     return false;
27625
27626   // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27627
27628   // Handle 128-bit lane shuffles of 256-bit vectors.
27629   // TODO - this should support binary shuffles.
27630   if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27631       !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27632     if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27633       return false; // Nothing to do!
27634     MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
          // One nibble of the immediate per 128-bit half of the result: a negative
          // (undef/zero) mask entry sets bit 3 of the nibble, any other entry
          // contributes its low bit.
27635     unsigned PermMask = 0;
27636     PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27637     PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
27638
27639     Res = DAG.getBitcast(ShuffleVT, V1);
27640     DCI.AddToWorklist(Res.getNode());
27641     Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27642                       DAG.getUNDEF(ShuffleVT),
27643                       DAG.getConstant(PermMask, DL, MVT::i8));
27644     DCI.AddToWorklist(Res.getNode());
27645     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27646                   /*AddTo*/ true);
27647     return true;
27648   }
27649
27650   // For masks that have been widened to 128-bit elements or more,
27651   // narrow back down to 64-bit elements.
27652   SmallVector<int, 64> Mask;
27653   if (BaseMaskEltSizeInBits > 64) {
27654     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27655     int MaskScale = BaseMaskEltSizeInBits / 64;
27656     scaleShuffleMask(MaskScale, BaseMask, Mask);
27657   } else {
27658     Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27659   }
27660
27661   unsigned NumMaskElts = Mask.size();
27662   unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27663
27664   // Determine the effective mask value type.
        // Mask elements narrower than 32 bits are always handled as integers.
27665   FloatDomain &= (32 <= MaskEltSizeInBits);
27666   MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27667                            : MVT::getIntegerVT(MaskEltSizeInBits);
27668   MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27669
27670   // Only allow legal mask types.
27671   if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27672     return false;
27673
27674   // Attempt to match the mask against known shuffle patterns.
27675   MVT ShuffleSrcVT, ShuffleVT;
27676   unsigned Shuffle, PermuteImm;
27677
27678   // Which shuffle domains are permitted?
27679   // Permit domain crossing at higher combine depths.
27680   bool AllowFloatDomain = FloatDomain || (Depth > 3);
27681   bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
27682                         (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
27683
27684   // Determine zeroable mask elements.
27685   APInt Zeroable(NumMaskElts, 0);
27686   for (unsigned i = 0; i != NumMaskElts; ++i)
27687     if (isUndefOrZero(Mask[i]))
27688       Zeroable.setBit(i);
27689
27690   if (UnaryShuffle) {
27691     // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
27692     // directly if we don't shuffle the lower element and we shuffle the upper
27693     // (zero) elements within themselves.
27694     if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27695         (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
            // Scale is the number of mask elements covering one scalar of V1.
27696       unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27697       ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27698       if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27699           isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27700         DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27701                       /*AddTo*/ true);
27702         return true;
27703       }
27704     }
27705
27706     if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27707                                 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27708                                 ShuffleVT)) {
27709       if (Depth == 1 && Root.getOpcode() == Shuffle)
27710         return false; // Nothing to do!
27711       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27712         return false; // AVX512 Writemask clash.
27713       Res = DAG.getBitcast(ShuffleSrcVT, V1);
27714       DCI.AddToWorklist(Res.getNode());
27715       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27716       DCI.AddToWorklist(Res.getNode());
27717       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27718                     /*AddTo*/ true);
27719       return true;
27720     }
27721
27722     if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27723                                        AllowIntDomain, Subtarget, Shuffle,
27724                                        ShuffleVT, PermuteImm)) {
27725       if (Depth == 1 && Root.getOpcode() == Shuffle)
27726         return false; // Nothing to do!
27727       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27728         return false; // AVX512 Writemask clash.
27729       Res = DAG.getBitcast(ShuffleVT, V1);
27730       DCI.AddToWorklist(Res.getNode());
27731       Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27732                         DAG.getConstant(PermuteImm, DL, MVT::i8));
27733       DCI.AddToWorklist(Res.getNode());
27734       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27735                     /*AddTo*/ true);
27736       return true;
27737     }
27738   }
27739
        // The binary matchers are tried for unary shuffles as well (V2 is undef
        // in that case); UnaryShuffle is passed through to the matcher.
27740   if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27741                                V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27742                                UnaryShuffle)) {
27743     if (Depth == 1 && Root.getOpcode() == Shuffle)
27744       return false; // Nothing to do!
27745     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27746       return false; // AVX512 Writemask clash.
27747     V1 = DAG.getBitcast(ShuffleVT, V1);
27748     DCI.AddToWorklist(V1.getNode());
27749     V2 = DAG.getBitcast(ShuffleVT, V2);
27750     DCI.AddToWorklist(V2.getNode());
27751     Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27752     DCI.AddToWorklist(Res.getNode());
27753     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27754                   /*AddTo*/ true);
27755     return true;
27756   }
27757
27758   if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27759                                       AllowIntDomain, V1, V2, DL, DAG,
27760                                       Subtarget, Shuffle, ShuffleVT,
27761                                       PermuteImm)) {
27762     if (Depth == 1 && Root.getOpcode() == Shuffle)
27763       return false; // Nothing to do!
27764     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27765       return false; // AVX512 Writemask clash.
27766     V1 = DAG.getBitcast(ShuffleVT, V1);
27767     DCI.AddToWorklist(V1.getNode());
27768     V2 = DAG.getBitcast(ShuffleVT, V2);
27769     DCI.AddToWorklist(V2.getNode());
27770     Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27771                       DAG.getConstant(PermuteImm, DL, MVT::i8));
27772     DCI.AddToWorklist(Res.getNode());
27773     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27774                   /*AddTo*/ true);
27775     return true;
27776   }
27777
27778   // Typically from here on, we need an integer version of MaskVT.
27779   MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
27780   IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
27781
27782   // Annoyingly, SSE4A instructions don't map into the above match helpers.
27783   if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
27784     uint64_t BitLen, BitIdx;
27785     if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
27786                                   Zeroable)) {
27787       if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
27788         return false; // Nothing to do!
27789       V1 = DAG.getBitcast(IntMaskVT, V1);
27790       DCI.AddToWorklist(V1.getNode());
27791       Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
27792                         DAG.getConstant(BitLen, DL, MVT::i8),
27793                         DAG.getConstant(BitIdx, DL, MVT::i8));
27794       DCI.AddToWorklist(Res.getNode());
27795       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27796                     /*AddTo*/ true);
27797       return true;
27798     }
27799
27800     if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
27801       if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
27802         return false; // Nothing to do!
27803       V1 = DAG.getBitcast(IntMaskVT, V1);
27804       DCI.AddToWorklist(V1.getNode());
27805       V2 = DAG.getBitcast(IntMaskVT, V2);
27806       DCI.AddToWorklist(V2.getNode());
27807       Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
27808                         DAG.getConstant(BitLen, DL, MVT::i8),
27809                         DAG.getConstant(BitIdx, DL, MVT::i8));
27810       DCI.AddToWorklist(Res.getNode());
27811       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27812                     /*AddTo*/ true);
27813       return true;
27814     }
27815   }
27816
27817   // Don't try to re-form single instruction chains under any circumstances now
27818   // that we've done encoding canonicalization for them.
27819   if (Depth < 2)
27820     return false;
27821
27822   bool MaskContainsZeros =
27823       any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27824
27825   if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27826     // If we have a single input lane-crossing shuffle then lower to VPERMV.
27827     if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27828         ((Subtarget.hasAVX2() &&
27829           (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27830          (Subtarget.hasAVX512() &&
27831           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27832            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27833          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27834          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27835          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27836          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27837       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27838       DCI.AddToWorklist(VPermMask.getNode());
27839       Res = DAG.getBitcast(MaskVT, V1);
27840       DCI.AddToWorklist(Res.getNode());
27841       Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27842       DCI.AddToWorklist(Res.getNode());
27843       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27844                     /*AddTo*/ true);
27845       return true;
27846     }
27847
27848     // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27849     // vector as the second source.
27850     if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27851         ((Subtarget.hasAVX512() &&
27852           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27853            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27854          (Subtarget.hasVLX() &&
27855           (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27856            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27857          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27858          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27859          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27860          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27861       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27862       for (unsigned i = 0; i != NumMaskElts; ++i)
27863         if (Mask[i] == SM_SentinelZero)
27864           Mask[i] = NumMaskElts + i;
27865
27866       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27867       DCI.AddToWorklist(VPermMask.getNode());
27868       Res = DAG.getBitcast(MaskVT, V1);
27869       DCI.AddToWorklist(Res.getNode());
27870       SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27871       DCI.AddToWorklist(Zero.getNode());
27872       Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27873       DCI.AddToWorklist(Res.getNode());
27874       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27875                     /*AddTo*/ true);
27876       return true;
27877     }
27878
27879     // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27880     if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27881         ((Subtarget.hasAVX512() &&
27882           (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27883            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27884          (Subtarget.hasVLX() &&
27885           (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27886            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27887          (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27888          (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27889          (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27890          (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27891       SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27892       DCI.AddToWorklist(VPermMask.getNode());
27893       V1 = DAG.getBitcast(MaskVT, V1);
27894       DCI.AddToWorklist(V1.getNode());
27895       V2 = DAG.getBitcast(MaskVT, V2);
27896       DCI.AddToWorklist(V2.getNode());
27897       Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27898       DCI.AddToWorklist(Res.getNode());
27899       DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27900                     /*AddTo*/ true);
27901       return true;
27902     }
          // Lane-crossing masks never reach the in-lane combines below.
27903     return false;
27904   }
27905
27906   // See if we can combine a single input shuffle with zeros to a bit-mask,
27907   // which is much simpler than any shuffle.
27908   if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27909       isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27910       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27911     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27912     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27913     APInt UndefElts(NumMaskElts, 0);
          // Every element starts as zero; elements kept by the mask become
          // all-ones and undef elements are recorded in UndefElts.
27914     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27915     for (unsigned i = 0; i != NumMaskElts; ++i) {
27916       int M = Mask[i];
27917       if (M == SM_SentinelUndef) {
27918         UndefElts.setBit(i);
27919         continue;
27920       }
27921       if (M == SM_SentinelZero)
27922         continue;
27923       EltBits[i] = AllOnes;
27924     }
27925     SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27926     DCI.AddToWorklist(BitMask.getNode());
27927     Res = DAG.getBitcast(MaskVT, V1);
27928     DCI.AddToWorklist(Res.getNode());
27929     unsigned AndOpcode =
27930         FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27931     Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27932     DCI.AddToWorklist(Res.getNode());
27933     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27934                   /*AddTo*/ true);
27935     return true;
27936   }
27937
27938   // If we have a single input shuffle with different shuffle patterns in the
27939   // 128-bit lanes, use the variable mask form of VPERMILPS.
27940   // TODO Combine other mask types at higher depths.
27941   if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27942       ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27943        (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27944     SmallVector<SDValue, 16> VPermIdx;
27945     for (int M : Mask) {
            // Indices are reduced modulo 4, i.e. to a position within a lane of
            // four 32-bit elements; undef entries stay undef.
27946       SDValue Idx =
27947           M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27948       VPermIdx.push_back(Idx);
27949     }
27950     SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
27951     DCI.AddToWorklist(VPermMask.getNode());
27952     Res = DAG.getBitcast(MaskVT, V1);
27953     DCI.AddToWorklist(Res.getNode());
27954     Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27955     DCI.AddToWorklist(Res.getNode());
27956     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27957                   /*AddTo*/ true);
27958     return true;
27959   }
27960
27961   // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27962   // to VPERMIL2PD/VPERMIL2PS.
27963   if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27964       (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27965        MaskVT == MVT::v8f32)) {
27966     // VPERMIL2 Operation.
27967     // Bits[3] - Match Bit.
27968     // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27969     // Bits[2:0] - (Per Lane) PS Shuffle Mask.
27970     unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27971     unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27972     SmallVector<int, 8> VPerm2Idx;
27973     unsigned M2ZImm = 0;
27974     for (int M : Mask) {
27975       if (M == SM_SentinelUndef) {
27976         VPerm2Idx.push_back(-1);
27977         continue;
27978       }
            // Zero elements set the match bit (8) in the selector and switch the
            // immediate operand to 2.
27979       if (M == SM_SentinelZero) {
27980         M2ZImm = 2;
27981         VPerm2Idx.push_back(8);
27982         continue;
27983       }
            // In-lane element index, offset by NumEltsPerLane when the element
            // comes from the second source; 64-bit selectors are shifted left
            // by one bit.
27984       int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27985       Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27986       VPerm2Idx.push_back(Index);
27987     }
27988     V1 = DAG.getBitcast(MaskVT, V1);
27989     DCI.AddToWorklist(V1.getNode());
27990     V2 = DAG.getBitcast(MaskVT, V2);
27991     DCI.AddToWorklist(V2.getNode());
27992     SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
27993     DCI.AddToWorklist(VPerm2MaskOp.getNode());
27994     Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27995                       DAG.getConstant(M2ZImm, DL, MVT::i8));
27996     DCI.AddToWorklist(Res.getNode());
27997     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27998                   /*AddTo*/ true);
27999     return true;
28000   }
28001
28002   // If we have 3 or more shuffle instructions or a chain involving a variable
28003   // mask, we can replace them with a single PSHUFB instruction profitably.
28004   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
28005   // instructions, but in practice PSHUFB tends to be *very* fast so we're
28006   // more aggressive.
28007   if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
28008       ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28009        (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
28010        (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
28011     SmallVector<SDValue, 16> PSHUFBMask;
28012     int NumBytes = RootVT.getSizeInBits() / 8;
          // Ratio is the number of bytes covered by each mask element.
28013     int Ratio = NumBytes / NumMaskElts;
28014     for (int i = 0; i < NumBytes; ++i) {
28015       int M = Mask[i / Ratio];
28016       if (M == SM_SentinelUndef) {
28017         PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
28018         continue;
28019       }
            // Zero elements use 255 as the control byte.
28020       if (M == SM_SentinelZero) {
28021         PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
28022         continue;
28023       }
            // Convert the element index into the index of the matching byte.
28024       M = Ratio * M + i % Ratio;
28025       assert ((M / 16) == (i / 16) && "Lane crossing detected");
28026       PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28027     }
28028     MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28029     Res = DAG.getBitcast(ByteVT, V1);
28030     DCI.AddToWorklist(Res.getNode());
28031     SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28032     DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28033     Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28034     DCI.AddToWorklist(Res.getNode());
28035     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28036                   /*AddTo*/ true);
28037     return true;
28038   }
28039
28040   // With XOP, if we have a 128-bit binary input shuffle we can always combine
28041   // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28042   // slower than PSHUFB on targets that support both.
28043   if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
28044       Subtarget.hasXOP()) {
28045     // VPPERM Mask Operation
28046     // Bits[4:0] - Byte Index (0 - 31)
28047     // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
28048     SmallVector<SDValue, 16> VPPERMMask;
28049     int NumBytes = 16;
28050     int Ratio = NumBytes / NumMaskElts;
28051     for (int i = 0; i < NumBytes; ++i) {
28052       int M = Mask[i / Ratio];
28053       if (M == SM_SentinelUndef) {
28054         VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28055         continue;
28056       }
            // 128 encodes permute operation 4 (ZERO) in bits [7:5].
28057       if (M == SM_SentinelZero) {
28058         VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28059         continue;
28060       }
28061       M = Ratio * M + i % Ratio;
28062       VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28063     }
28064     MVT ByteVT = MVT::v16i8;
28065     V1 = DAG.getBitcast(ByteVT, V1);
28066     DCI.AddToWorklist(V1.getNode());
28067     V2 = DAG.getBitcast(ByteVT, V2);
28068     DCI.AddToWorklist(V2.getNode());
28069     SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28070     DCI.AddToWorklist(VPPERMMaskOp.getNode());
28071     Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28072     DCI.AddToWorklist(Res.getNode());
28073     DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28074                   /*AddTo*/ true);
28075     return true;
28076   }
28077
28078   // Failed to find any combines.
28079   return false;
28080 }
28081
28082 // Attempt to constant fold all of the constant source ops.
28083 // Returns true if the entire shuffle is folded to a constant.
28084 // TODO: Extend this to merge multiple constant Ops and update the mask.
28085 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28086                                         ArrayRef<int> Mask, SDValue Root,
28087                                         bool HasVariableMask, SelectionDAG &DAG,
28088                                         TargetLowering::DAGCombinerInfo &DCI,
28089                                         const X86Subtarget &Subtarget) {
28090   MVT VT = Root.getSimpleValueType();
28091
28092   unsigned SizeInBits = VT.getSizeInBits();
28093   unsigned NumMaskElts = Mask.size();
28094   unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28095   unsigned NumOps = Ops.size();
28096
28097   // Extract constant bits from each source op.
28098   bool OneUseConstantOp = false;
28099   SmallVector<APInt, 16> UndefEltsOps(NumOps);
28100   SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28101   for (unsigned i = 0; i != NumOps; ++i) {
28102     SDValue SrcOp = Ops[i];
28103     OneUseConstantOp |= SrcOp.hasOneUse();
28104     if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28105                                        RawBitsOps[i]))
28106       return false;
28107   }
28108
28109   // Only fold if at least one of the constants is only used once or
28110   // the combined shuffle has included a variable mask shuffle, this
28111   // is to avoid constant pool bloat.
28112   if (!OneUseConstantOp && !HasVariableMask)
28113     return false;
28114
28115   // Shuffle the constant bits according to the mask.
28116   APInt UndefElts(NumMaskElts, 0);
28117   APInt ZeroElts(NumMaskElts, 0);
28118   APInt ConstantElts(NumMaskElts, 0);
28119   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28120                                         APInt::getNullValue(MaskSizeInBits));
28121   for (unsigned i = 0; i != NumMaskElts; ++i) {
28122     int M = Mask[i];
28123     if (M == SM_SentinelUndef) {
28124       UndefElts.setBit(i);
28125       continue;
28126     } else if (M == SM_SentinelZero) {
28127       ZeroElts.setBit(i);
28128       continue;
28129     }
28130     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28131
28132     unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28133     unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28134
28135     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28136     if (SrcUndefElts[SrcMaskIdx]) {
28137       UndefElts.setBit(i);
28138       continue;
28139     }
28140
28141     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28142     APInt &Bits = SrcEltBits[SrcMaskIdx];
28143     if (!Bits) {
28144       ZeroElts.setBit(i);
28145       continue;
28146     }
28147
28148     ConstantElts.setBit(i);
28149     ConstantBitData[i] = Bits;
28150   }
28151   assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28152
28153   // Create the constant data.
28154   MVT MaskSVT;
28155   if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28156     MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28157   else
28158     MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28159
28160   MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28161
28162   SDLoc DL(Root);
28163   SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28164   DCI.AddToWorklist(CstOp.getNode());
28165   DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
28166   return true;
28167 }
28168
28169 /// \brief Fully generic combining of x86 shuffle instructions.
28170 ///
28171 /// This should be the last combine run over the x86 shuffle instructions. Once
28172 /// they have been fully optimized, this will recursively consider all chains
28173 /// of single-use shuffle instructions, build a generic model of the cumulative
28174 /// shuffle operation, and check for simpler instructions which implement this
28175 /// operation. We use this primarily for two purposes:
28176 ///
28177 /// 1) Collapse generic shuffles to specialized single instructions when
28178 ///    equivalent. In most cases, this is just an encoding size win, but
28179 ///    sometimes we will collapse multiple generic shuffles into a single
28180 ///    special-purpose shuffle.
28181 /// 2) Look for sequences of shuffle instructions with 3 or more total
28182 ///    instructions, and replace them with the slightly more expensive SSSE3
28183 ///    PSHUFB instruction if available. We do this as the last combining step
28184 ///    to ensure we avoid using PSHUFB if we can implement the shuffle with
28185 ///    a suitable short sequence of other instructions. The PSHUFB will either
28186 ///    use a register or have to read from memory and so is slightly (but only
28187 ///    slightly) more expensive than the other shuffle instructions.
28188 ///
28189 /// Because this is inherently a quadratic operation (for each shuffle in
28190 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28191 /// This should never be an issue in practice as the shuffle lowering doesn't
28192 /// produce sequences of more than 8 instructions.
28193 ///
28194 /// FIXME: We will currently miss some cases where the redundant shuffling
28195 /// would simplify under the threshold for PSHUFB formation because of
28196 /// combine-ordering. To fix this, we should do the redundant instruction
28197 /// combining in this recursive walk.
static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                          int SrcOpIndex, SDValue Root,
                                          ArrayRef<int> RootMask,
                                          ArrayRef<const SDNode*> SrcNodes,
                                          int Depth, bool HasVariableMask,
                                          SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  if (Depth > 8)
    return false;

  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return false; // Bail if we hit a non-vector.

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  // Extract target shuffle mask and resolve sentinels and inputs.
  // If Op cannot be decoded as a shuffle there is nothing to merge here.
  SmallVector<int, 64> OpMask;
  SmallVector<SDValue, 2> OpInputs;
  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
    return false;

  // A decoded shuffle has at most two inputs; missing ones stay as empty
  // SDValues.
  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

  // Add the inputs to the Ops list, avoiding duplicates.
  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

  // Look for each input among the ops we already track, comparing the
  // values with all bitcasts peeled off. -1 means "not present yet".
  int InputIdx0 = -1, InputIdx1 = -1;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }

  // A new first input takes over the slot of the op we are expanding; a new
  // second input is appended to the end of the list.
  if (Input0 && InputIdx0 < 0) {
    InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
    InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");

  // This function can be performance-critical, so we rely on the power-of-2
  // knowledge that we have about the mask sizes to replace div/rem ops with
  // bit-masks and shifts.
  assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
  unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

  // The merged mask uses the finer of the two granularities. RootRatio /
  // OpRatio is the number of merged-mask elements covered by one element of
  // the root / op mask respectively; at most one of them is greater than 1.
  unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
  unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
  unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
  assert((RootRatio == 1 || OpRatio == 1) &&
         "Must not have a ratio for both incoming and op masks!");

  assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
  unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

  SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
  //
  // Non-negative mask values are encoded as (op index * MaskWidth + element),
  // so the op a value refers to can be recovered by range comparison.
  for (unsigned i = 0; i < MaskWidth; ++i) {
    unsigned RootIdx = i >> RootRatioLog2;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    // Scale the root mask value up to the merged mask granularity.
    unsigned RootMaskedIdx =
        RootRatio == 1
            ? RootMask[RootIdx]
            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    // Drop the op-index part, leaving the element index within SrcOp, then
    // find the op mask element that covers it.
    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
    unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are using.
      Mask[i] = OpMask[OpIdx];
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    unsigned OpMaskedIdx =
        OpRatio == 1
            ? OpMask[OpIdx]
            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

    // Reduce to an element index within a single input, then rebase it onto
    // the slot that input occupies in Ops. Op mask values below OpMask.size()
    // select the first input, all others select the second.
    OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }

  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
    DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
    return true;
  }
  if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
    // TODO - should we handle the mixed zero/undef case as well? Just returning
    // a zero mask will lose information on undef elements possibly reducing
    // future combine possibilities.
    DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
                                                Subtarget, DAG, SDLoc(Root)));
    return true;
  }

  // Remove unused shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);
  assert(!Ops.empty() && "Shuffle with no inputs detected");

  // The flag is sticky: once any shuffle in the chain has a variable mask it
  // stays set for the rest of the walk.
  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be combined if it either has a
  // single use (i.e. current Op) or all its users have already been combined.
  // The first recursive call that reports a successful combine ends the walk.
  for (int i = 0, e = Ops.size(); i < e; ++i)
    if (Ops[i].getNode()->hasOneUse() ||
        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
                                        Depth + 1, HasVariableMask, DAG, DCI,
                                        Subtarget))
        return true;

  // Attempt to constant fold all of the constant source ops.
  if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
                                  Subtarget))
    return true;

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return false;

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  SmallVector<int, 64> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

  // Finally try to replace the whole accumulated chain with a single shuffle.
  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
                                DCI, Subtarget);
}
28401
28402 /// \brief Get the PSHUF-style mask from PSHUF node.
28403 ///
28404 /// This is a very minor wrapper around getTargetShuffleMask to easy forming v4
28405 /// PSHUF-style masks that can be reused with such instructions.
28406 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28407   MVT VT = N.getSimpleValueType();
28408   SmallVector<int, 4> Mask;
28409   SmallVector<SDValue, 2> Ops;
28410   bool IsUnary;
28411   bool HaveMask =
28412       getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28413   (void)HaveMask;
28414   assert(HaveMask);
28415
28416   // If we have more than 128-bits, only the low 128-bits of shuffle mask
28417   // matter. Check that the upper masks are repeats and remove them.
28418   if (VT.getSizeInBits() > 128) {
28419     int LaneElts = 128 / VT.getScalarSizeInBits();
28420 #ifndef NDEBUG
28421     for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28422       for (int j = 0; j < LaneElts; ++j)
28423         assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28424                "Mask doesn't repeat in high 128-bit lanes!");
28425 #endif
28426     Mask.resize(LaneElts);
28427   }
28428
28429   switch (N.getOpcode()) {
28430   case X86ISD::PSHUFD:
28431     return Mask;
28432   case X86ISD::PSHUFLW:
28433     Mask.resize(4);
28434     return Mask;
28435   case X86ISD::PSHUFHW:
28436     Mask.erase(Mask.begin(), Mask.begin() + 4);
28437     for (int &M : Mask)
28438       M -= 4;
28439     return Mask;
28440   default:
28441     llvm_unreachable("No valid shuffle instruction found!");
28442   }
28443 }
28444
/// \brief Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
///
/// \param N    The PSHUFD node at the bottom of the chain.
/// \param Mask The v4 mask of \p N. When a combinable shuffle is found it is
///             rewritten in place to the merged mask.
/// \param DAG  Used to build the replacement nodes.
/// \returns The value that should replace \p N, or an empty SDValue if
///          nothing could be combined.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    // Inside this switch, `continue` keeps walking up the chain, while
    // `break` means V is the shuffle to merge with and ends the walk.
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      // Remember the shuffle so it can be re-created on top of the merged one.
      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      // Remember the shuffle so it can be re-created on top of the merged one.
      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      // UNPCKL pairs with a low-word shuffle, UNPCKH with a high-word one.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      // Only unpacks of a value with itself are handled, and the unpack must
      // be the sole user of that value.
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          // The matching half shuffle is the one we merge into.
          if (V.getOpcode() == CombineOp)
            break;

          // A shuffle of the other half is kept in the rebuilt chain.
          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  // Each element of Mask is looked up through V's own mask, giving the
  // composition of the two shuffles.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  // Nodes are popped in reverse push order, so the one nearest to the merged
  // shuffle is re-created first.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    // Re-insert a bitcast wherever the original chain changed types.
    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // Only self-unpacks were recorded, so V feeds both operands.
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      // Reuse the original immediate operand unchanged.
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
28577
28578 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28579 /// pshufhw.
28580 ///
28581 /// We walk up the chain, skipping shuffles of the other half and looking
28582 /// through shuffles which switch halves trying to find a shuffle of the same
28583 /// pair of dwords.
28584 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28585                                         SelectionDAG &DAG,
28586                                         TargetLowering::DAGCombinerInfo &DCI) {
28587   assert(
28588       (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28589       "Called with something other than an x86 128-bit half shuffle!");
28590   SDLoc DL(N);
28591   unsigned CombineOpcode = N.getOpcode();
28592
28593   // Walk up a single-use chain looking for a combinable shuffle.
28594   SDValue V = N.getOperand(0);
28595   for (; V.hasOneUse(); V = V.getOperand(0)) {
28596     switch (V.getOpcode()) {
28597     default:
28598       return false; // Nothing combined!
28599
28600     case ISD::BITCAST:
28601       // Skip bitcasts as we always know the type for the target specific
28602       // instructions.
28603       continue;
28604
28605     case X86ISD::PSHUFLW:
28606     case X86ISD::PSHUFHW:
28607       if (V.getOpcode() == CombineOpcode)
28608         break;
28609
28610       // Other-half shuffles are no-ops.
28611       continue;
28612     }
28613     // Break out of the loop if we break out of the switch.
28614     break;
28615   }
28616
28617   if (!V.hasOneUse())
28618     // We fell out of the loop without finding a viable combining instruction.
28619     return false;
28620
28621   // Combine away the bottom node as its shuffle will be accumulated into
28622   // a preceding shuffle.
28623   DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28624
28625   // Record the old value.
28626   SDValue Old = V;
28627
28628   // Merge this node's mask and our incoming mask (adjusted to account for all
28629   // the pshufd instructions encountered).
28630   SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28631   for (int &M : Mask)
28632     M = VMask[M];
28633   V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28634                   getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28635
28636   // Check that the shuffles didn't cancel each other out. If not, we need to
28637   // combine to the new one.
28638   if (Old != V)
28639     // Replace the combinable shuffle with the combined one, updating all users
28640     // so that we re-evaluate the chain here.
28641     DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28642
28643   return true;
28644 }
28645
/// \brief Try to combine x86 target specific shuffles.
///
/// Dispatches on the opcode of \p N. UNPCKL, BLENDI, MOVSD/MOVSS and INSERTPS
/// are handled entirely inside the first switch and return from it. The PSHUF
/// family only decodes its mask there and falls through to the no-op and
/// redundant-shuffle handling at the end of the function.
///
/// \returns A replacement value for \p N, or an empty SDValue if no new value
///          is produced (this includes the case where the combine was
///          performed directly through \p DCI).
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    // Decode the v4 mask; the actual combines happen after this switch.
    Mask = getPSHUFShuffleMask(N);
    assert(Mask.size() == 4);
    break;
  case X86ISD::UNPCKL: {
    auto Op0 = N.getOperand(0);
    auto Op1 = N.getOperand(1);
    unsigned Opcode0 = Op0.getOpcode();
    unsigned Opcode1 = Op1.getOpcode();

    // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
    // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
    // TODO: Add other horizontal operations as required.
    if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
      return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));

    // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
    // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
    // moves upper half elements into the lower half part. For example:
    //
    // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
    //     undef:v16i8
    // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
    //
    // will be combined to:
    //
    // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

    // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
    // happen due to advanced instructions.
    if (!VT.is128BitVector())
      return SDValue();

    if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
      // Note: this local Mask shadows the function-level Mask.
      ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

      // Expected pattern: the low half holds NumElts/2 .. NumElts-1 and the
      // high half is undef (-1).
      unsigned NumElts = VT.getVectorNumElements();
      SmallVector<int, 8> ExpectedMask(NumElts, -1);
      std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
                NumElts / 2);

      auto ShufOp = Op1.getOperand(0);
      if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
        return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
    }
    return SDValue();
  }
  case X86ISD::BLENDI: {
    SDValue V0 = N->getOperand(0);
    SDValue V1 = N->getOperand(1);
    assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
           "Unexpected input vector types");

    // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
    // operands and changing the mask to 1. This saves us a bunch of
    // pattern-matching possibilities related to scalar math ops in SSE/AVX.
    // x86InstrInfo knows how to commute this back after instruction selection
    // if it would help register allocation.

    // TODO: If optimizing for size or a processor that doesn't suffer from
    // partial register update stalls, this should be transformed into a MOVSD
    // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

    // The swap is skipped when V0 is a foldable load.
    if (VT == MVT::v2f64)
      if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
        if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
          SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
          return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
        }

    return SDValue();
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS: {
    SDValue V0 = peekThroughBitcasts(N->getOperand(0));
    SDValue V1 = peekThroughBitcasts(N->getOperand(1));
    bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
    bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
    // Nothing to do here if both inputs are all-zero build vectors.
    if (isZero0 && isZero1)
      return SDValue();

    // We often lower to MOVSD/MOVSS from integer as well as native float
    // types; remove unnecessary domain-crossing bitcasts if we can to make it
    // easier to combine shuffles later on. We've already accounted for the
    // domain switching cost when we decided to lower with it.
    bool isFloat = VT.isFloatingPoint();
    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
    // Only switch domains when each input is either in the other domain
    // already or is an all-zero vector.
    if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
      // Pick the same-width vector type from the opposite domain.
      MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
                          : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
      V0 = DAG.getBitcast(NewVT, V0);
      V1 = DAG.getBitcast(NewVT, V1);
      return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
    }

    return SDValue();
  }
  case X86ISD::INSERTPS: {
    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
    SDValue Op0 = N.getOperand(0);
    SDValue Op1 = N.getOperand(1);
    SDValue Op2 = N.getOperand(2);
    // Immediate layout as decoded here: bits [7:6] source element index,
    // bits [5:4] destination element index, bits [3:0] per-element zero mask.
    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
    unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
    unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
    unsigned ZeroMask = InsertPSMask & 0xF;

    // If we zero out all elements from Op0 then we don't need to reference it.
    if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // If we zero out the element from Op1 then we don't need to reference it.
    if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    // Attempt to merge insertps Op1 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask1;
    SmallVector<SDValue, 2> Ops1;
    if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
      // M is the inner shuffle's mask value for the element we insert.
      int M = TargetMask1[SrcIdx];
      if (isUndefOrZero(M)) {
        // Zero/UNDEF insertion - zero out element and remove dependency.
        InsertPSMask |= (1u << DstIdx);
        return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
      }
      // Update insertps mask srcidx and reference the source input directly.
      // Values 0-3 select from the inner shuffle's first input, 4-7 from its
      // second.
      assert(0 <= M && M < 8 && "Shuffle index out of range");
      InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
      Op1 = Ops1[M < 4 ? 0 : 1];
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
    }

    // Attempt to merge insertps Op0 with an inner target shuffle node.
    SmallVector<int, 8> TargetMask0;
    SmallVector<SDValue, 2> Ops0;
    if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
      return SDValue();

    // Updated records whether anything changed; UseInput00/UseInput01 record
    // which of the inner shuffle's inputs the surviving elements come from.
    bool Updated = false;
    bool UseInput00 = false;
    bool UseInput01 = false;
    for (int i = 0; i != 4; ++i) {
      int M = TargetMask0[i];
      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
        // No change if element is already zero or the inserted element.
        continue;
      } else if (isUndefOrZero(M)) {
        // If the target mask is undef/zero then we must zero the element.
        InsertPSMask |= (1u << i);
        Updated = true;
        continue;
      }

      // The input vector element must be inline.
      // That is, it must come from position i of either inner input.
      if (M != i && M != (i + 4))
        return SDValue();

      // Determine which inputs of the target shuffle we're using.
      UseInput00 |= (0 <= M && M < 4);
      UseInput01 |= (4 <= M);
    }

    // If we're not using both inputs of the target shuffle then use the
    // referenced input directly.
    if (UseInput00 && !UseInput01) {
      Updated = true;
      Op0 = Ops0[0];
    } else if (!UseInput00 && UseInput01) {
      Updated = true;
      Op0 = Ops0[1];
    }

    if (Updated)
      return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
                         DAG.getConstant(InsertPSMask, DL, MVT::i8));

    return SDValue();
  }
  default:
    return SDValue();
  }

  // Only the PSHUFD/PSHUFLW/PSHUFHW cases reach this point.

  // Nuke no-op shuffles that show up after combining.
  if (isNoopShuffleMask(Mask))
    return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

  // Look for simplifications involving one or two shuffle instructions.
  SDValue V = N.getOperand(0);
  switch (N.getOpcode()) {
  default:
    break;
  case X86ISD::PSHUFLW:
  case X86ISD::PSHUFHW:
    assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

    if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
      return SDValue(); // We combined away this shuffle, so we're done.

    // See if this reduces to a PSHUFD which is no more expensive and can
    // combine with more operations. Note that it has to at least flip the
    // dwords as otherwise it would have been removed as a no-op.
    if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
      // Swap the two dwords of the affected half (dwords 0/1 for PSHUFLW,
      // 2/3 for PSHUFHW) and leave the other half in place.
      int DMask[] = {0, 1, 2, 3};
      int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
      DMask[DOffset + 0] = DOffset + 1;
      DMask[DOffset + 1] = DOffset + 0;
      MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
      V = DAG.getBitcast(DVT, V);
      DCI.AddToWorklist(V.getNode());
      V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
                      getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
      DCI.AddToWorklist(V.getNode());
      return DAG.getBitcast(VT, V);
    }

    // Look for shuffle patterns which can be implemented as a single unpack.
    // FIXME: This doesn't handle the location of the PSHUFD generically, and
    // only works when we have a PSHUFD followed by two half-shuffles.
    if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
        (V.getOpcode() == X86ISD::PSHUFLW ||
         V.getOpcode() == X86ISD::PSHUFHW) &&
        V.getOpcode() != N.getOpcode() &&
        V.hasOneUse()) {
      SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
      if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
        SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
        SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
        // N and V shuffle opposite halves (checked above), so together they
        // fill all eight entries of the combined word mask.
        int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
        int WordMask[8];
        for (int i = 0; i < 4; ++i) {
          WordMask[i + NOffset] = Mask[i] + NOffset;
          WordMask[i + VOffset] = VMask[i] + VOffset;
        }
        // Map the word mask through the DWord mask.
        int MappedMask[8];
        for (int i = 0; i < 8; ++i)
          MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
        if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
            makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
          // We can replace all three shuffles with an unpack.
          V = DAG.getBitcast(VT, D.getOperand(0));
          DCI.AddToWorklist(V.getNode());
          return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
                                                : X86ISD::UNPCKH,
                             DL, VT, V, V);
        }
      }
    }

    break;

  case X86ISD::PSHUFD:
    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
      return NewN;

    break;
  }

  return SDValue();
}
28924
28925 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28926 /// operation. If true is returned then the operands of ADDSUB operation
28927 /// are written to the parameters \p Opnd0 and \p Opnd1.
28928 ///
28929 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28930 /// so it is easier to generically match. We also insert dummy vector shuffle
28931 /// nodes for the operands which explicitly discard the lanes which are unused
28932 /// by this operation to try to flow through the rest of the combiner
28933 /// the fact that they're unused.
28934 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28935                      SDValue &Opnd0, SDValue &Opnd1) {
28936
28937   EVT VT = N->getValueType(0);
28938   if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28939       (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28940       (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28941     return false;
28942
28943   // We only handle target-independent shuffles.
28944   // FIXME: It would be easy and harmless to use the target shuffle mask
28945   // extraction tool to support more.
28946   if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28947     return false;
28948
28949   ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28950   SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28951
28952   SDValue V1 = N->getOperand(0);
28953   SDValue V2 = N->getOperand(1);
28954
28955   // We require the first shuffle operand to be the FSUB node, and the second to
28956   // be the FADD node.
28957   if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28958     ShuffleVectorSDNode::commuteMask(Mask);
28959     std::swap(V1, V2);
28960   } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28961     return false;
28962
28963   // If there are other uses of these operations we can't fold them.
28964   if (!V1->hasOneUse() || !V2->hasOneUse())
28965     return false;
28966
28967   // Ensure that both operations have the same operands. Note that we can
28968   // commute the FADD operands.
28969   SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28970   if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28971       (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28972     return false;
28973
28974   // We're looking for blends between FADD and FSUB nodes. We insist on these
28975   // nodes being lined up in a specific expected pattern.
28976   if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28977         isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28978         isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28979         isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28980                                            8, 25, 10, 27, 12, 29, 14, 31})))
28981     return false;
28982
28983   Opnd0 = LHS;
28984   Opnd1 = RHS;
28985   return true;
28986 }
28987
28988 /// \brief Try to combine a shuffle into a target-specific add-sub or
28989 /// mul-add-sub node.
28990 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28991                                                 const X86Subtarget &Subtarget,
28992                                                 SelectionDAG &DAG) {
28993   SDValue Opnd0, Opnd1;
28994   if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28995     return SDValue();
28996
28997   EVT VT = N->getValueType(0);
28998   SDLoc DL(N);
28999
29000   // Try to generate X86ISD::FMADDSUB node here.
29001   SDValue Opnd2;
29002   if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
29003     return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
29004
29005   // Do not generate X86ISD::ADDSUB node for 512-bit types even though
29006   // the ADDSUB idiom has been successfully recognized. There are no known
29007   // X86 targets with 512-bit ADDSUB instructions!
29008   if (VT.is512BitVector())
29009     return SDValue();
29010
29011   return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
29012 }
29013
29014 // We are looking for a shuffle where both sources are concatenated with undef
29015 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
29016 // if we can express this as a single-source shuffle, that's preferable.
29017 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
29018                                            const X86Subtarget &Subtarget) {
29019   if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
29020     return SDValue();
29021
29022   EVT VT = N->getValueType(0);
29023
29024   // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
29025   if (!VT.is128BitVector() && !VT.is256BitVector())
29026     return SDValue();
29027
29028   if (VT.getVectorElementType() != MVT::i32 &&
29029       VT.getVectorElementType() != MVT::i64 &&
29030       VT.getVectorElementType() != MVT::f32 &&
29031       VT.getVectorElementType() != MVT::f64)
29032     return SDValue();
29033
29034   SDValue N0 = N->getOperand(0);
29035   SDValue N1 = N->getOperand(1);
29036
29037   // Check that both sources are concats with undef.
29038   if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29039       N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29040       N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29041       !N1.getOperand(1).isUndef())
29042     return SDValue();
29043
29044   // Construct the new shuffle mask. Elements from the first source retain their
29045   // index, but elements from the second source no longer need to skip an undef.
29046   SmallVector<int, 8> Mask;
29047   int NumElts = VT.getVectorNumElements();
29048
29049   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29050   for (int Elt : SVOp->getMask())
29051     Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
29052
29053   SDLoc DL(N);
29054   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29055                                N1.getOperand(0));
29056   return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29057 }
29058
29059 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29060                               TargetLowering::DAGCombinerInfo &DCI,
29061                               const X86Subtarget &Subtarget) {
29062   SDLoc dl(N);
29063   EVT VT = N->getValueType(0);
29064   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29065   // If we have legalized the vector types, look for blends of FADD and FSUB
29066   // nodes that we can fuse into an ADDSUB node.
29067   if (TLI.isTypeLegal(VT))
29068     if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29069       return AddSub;
29070
29071   // During Type Legalization, when promoting illegal vector types,
29072   // the backend might introduce new shuffle dag nodes and bitcasts.
29073   //
29074   // This code performs the following transformation:
29075   // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29076   //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29077   //
29078   // We do this only if both the bitcast and the BINOP dag nodes have
29079   // one use. Also, perform this transformation only if the new binary
29080   // operation is legal. This is to avoid introducing dag nodes that
29081   // potentially need to be further expanded (or custom lowered) into a
29082   // less optimal sequence of dag nodes.
29083   if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29084       N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29085       N->getOperand(0).getOpcode() == ISD::BITCAST &&
29086       N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29087     SDValue N0 = N->getOperand(0);
29088     SDValue N1 = N->getOperand(1);
29089
29090     SDValue BC0 = N0.getOperand(0);
29091     EVT SVT = BC0.getValueType();
29092     unsigned Opcode = BC0.getOpcode();
29093     unsigned NumElts = VT.getVectorNumElements();
29094
29095     if (BC0.hasOneUse() && SVT.isVector() &&
29096         SVT.getVectorNumElements() * 2 == NumElts &&
29097         TLI.isOperationLegal(Opcode, VT)) {
29098       bool CanFold = false;
29099       switch (Opcode) {
29100       default : break;
29101       case ISD::ADD:
29102       case ISD::SUB:
29103       case ISD::MUL:
29104         // isOperationLegal lies for integer ops on floating point types.
29105         CanFold = VT.isInteger();
29106         break;
29107       case ISD::FADD:
29108       case ISD::FSUB:
29109       case ISD::FMUL:
29110         // isOperationLegal lies for floating point ops on integer types.
29111         CanFold = VT.isFloatingPoint();
29112         break;
29113       }
29114
29115       unsigned SVTNumElts = SVT.getVectorNumElements();
29116       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29117       for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
29118         CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29119       for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
29120         CanFold = SVOp->getMaskElt(i) < 0;
29121
29122       if (CanFold) {
29123         SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29124         SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29125         SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29126         return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29127       }
29128     }
29129   }
29130
29131   // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29132   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29133   // consecutive, non-overlapping, and in the right order.
29134   SmallVector<SDValue, 16> Elts;
29135   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
29136     if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29137       Elts.push_back(Elt);
29138       continue;
29139     }
29140     Elts.clear();
29141     break;
29142   }
29143
29144   if (Elts.size() == VT.getVectorNumElements())
29145     if (SDValue LD =
29146             EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
29147       return LD;
29148
29149   // For AVX2, we sometimes want to combine
29150   // (vector_shuffle <mask> (concat_vectors t1, undef)
29151   //                        (concat_vectors t2, undef))
29152   // Into:
29153   // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29154   // Since the latter can be efficiently lowered with VPERMD/VPERMQ
29155   if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29156     return ShufConcat;
29157
29158   if (isTargetShuffle(N->getOpcode())) {
29159     SDValue Op(N, 0);
29160     if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29161       return Shuffle;
29162
29163     // Try recursively combining arbitrary sequences of x86 shuffle
29164     // instructions into higher-order shuffles. We do this after combining
29165     // specific PSHUF instruction sequences into their minimal form so that we
29166     // can evaluate how many specialized shuffle instructions are involved in
29167     // a particular chain.
29168     SmallVector<int, 1> NonceMask; // Just a placeholder.
29169     NonceMask.push_back(0);
29170     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
29171                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
29172                                       DCI, Subtarget))
29173       return SDValue(); // This routine will use CombineTo to replace N.
29174   }
29175
29176   return SDValue();
29177 }
29178
29179 /// Check if a vector extract from a target-specific shuffle of a load can be
29180 /// folded into a single element load.
29181 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29182 /// shuffles have been custom lowered so we need to handle those here.
29183 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29184                                          TargetLowering::DAGCombinerInfo &DCI) {
29185   if (DCI.isBeforeLegalizeOps())
29186     return SDValue();
29187
29188   SDValue InVec = N->getOperand(0);
29189   SDValue EltNo = N->getOperand(1);
29190   EVT EltVT = N->getValueType(0);
29191
29192   if (!isa<ConstantSDNode>(EltNo))
29193     return SDValue();
29194
29195   EVT OriginalVT = InVec.getValueType();
29196
29197   // Peek through bitcasts, don't duplicate a load with other uses.
29198   InVec = peekThroughOneUseBitcasts(InVec);
29199
29200   EVT CurrentVT = InVec.getValueType();
29201   if (!CurrentVT.isVector() ||
29202       CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29203     return SDValue();
29204
29205   if (!isTargetShuffle(InVec.getOpcode()))
29206     return SDValue();
29207
29208   // Don't duplicate a load with other uses.
29209   if (!InVec.hasOneUse())
29210     return SDValue();
29211
29212   SmallVector<int, 16> ShuffleMask;
29213   SmallVector<SDValue, 2> ShuffleOps;
29214   bool UnaryShuffle;
29215   if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29216                             ShuffleOps, ShuffleMask, UnaryShuffle))
29217     return SDValue();
29218
29219   // Select the input vector, guarding against out of range extract vector.
29220   unsigned NumElems = CurrentVT.getVectorNumElements();
29221   int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29222   int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29223
29224   if (Idx == SM_SentinelZero)
29225     return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29226                              : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29227   if (Idx == SM_SentinelUndef)
29228     return DAG.getUNDEF(EltVT);
29229
29230   assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29231   SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29232                                          : ShuffleOps[1];
29233
29234   // If inputs to shuffle are the same for both ops, then allow 2 uses
29235   unsigned AllowedUses =
29236       (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29237
29238   if (LdNode.getOpcode() == ISD::BITCAST) {
29239     // Don't duplicate a load with other uses.
29240     if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29241       return SDValue();
29242
29243     AllowedUses = 1; // only allow 1 load use if we have a bitcast
29244     LdNode = LdNode.getOperand(0);
29245   }
29246
29247   if (!ISD::isNormalLoad(LdNode.getNode()))
29248     return SDValue();
29249
29250   LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29251
29252   if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29253     return SDValue();
29254
29255   // If there's a bitcast before the shuffle, check if the load type and
29256   // alignment is valid.
29257   unsigned Align = LN0->getAlignment();
29258   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29259   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
29260       EltVT.getTypeForEVT(*DAG.getContext()));
29261
29262   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
29263     return SDValue();
29264
29265   // All checks match so transform back to vector_shuffle so that DAG combiner
29266   // can finish the job
29267   SDLoc dl(N);
29268
29269   // Create shuffle node taking into account the case that its a unary shuffle
29270   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
29271   Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
29272                                  ShuffleMask);
29273   Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
29274   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
29275                      EltNo);
29276 }
29277
29278 // Try to match patterns such as
29279 // (i16 bitcast (v16i1 x))
29280 // ->
29281 // (i16 movmsk (16i8 sext (v16i1 x)))
29282 // before the illegal vector is scalarized on subtargets that don't have legal
29283 // vxi1 types.
29284 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
29285                                   const X86Subtarget &Subtarget) {
29286   EVT VT = BitCast.getValueType();
29287   SDValue N0 = BitCast.getOperand(0);
29288   EVT VecVT = N0->getValueType(0);
29289
29290   if (!VT.isScalarInteger() || !VecVT.isSimple())
29291     return SDValue();
29292
29293   // With AVX512 vxi1 types are legal and we prefer using k-regs.
29294   // MOVMSK is supported in SSE2 or later.
29295   if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
29296     return SDValue();
29297
29298   // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
29299   // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
29300   // v8i16 and v16i16.
29301   // For these two cases, we can shuffle the upper element bytes to a
29302   // consecutive sequence at the start of the vector and treat the results as
29303   // v16i8 or v32i8, and for v61i8 this is the preferable solution. However,
29304   // for v16i16 this is not the case, because the shuffle is expensive, so we
29305   // avoid sign-extending to this type entirely.
29306   // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
29307   // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
29308   MVT SExtVT;
29309   MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
29310   switch (VecVT.getSimpleVT().SimpleTy) {
29311   default:
29312     return SDValue();
29313   case MVT::v2i1:
29314     SExtVT = MVT::v2i64;
29315     FPCastVT = MVT::v2f64;
29316     break;
29317   case MVT::v4i1:
29318     SExtVT = MVT::v4i32;
29319     FPCastVT = MVT::v4f32;
29320     // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
29321     // sign-extend to a 256-bit operation to avoid truncation.
29322     if (N0->getOpcode() == ISD::SETCC &&
29323         N0->getOperand(0)->getValueType(0).is256BitVector() &&
29324         Subtarget.hasInt256()) {
29325       SExtVT = MVT::v4i64;
29326       FPCastVT = MVT::v4f64;
29327     }
29328     break;
29329   case MVT::v8i1:
29330     SExtVT = MVT::v8i16;
29331     // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
29332     // sign-extend to a 256-bit operation to match the compare.
29333     // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
29334     // 256-bit because the shuffle is cheaper than sign extending the result of
29335     // the compare.
29336     if (N0->getOpcode() == ISD::SETCC &&
29337         N0->getOperand(0)->getValueType(0).is256BitVector() &&
29338         Subtarget.hasInt256()) {
29339       SExtVT = MVT::v8i32;
29340       FPCastVT = MVT::v8f32;
29341     }
29342     break;
29343   case MVT::v16i1:
29344     SExtVT = MVT::v16i8;
29345     // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
29346     // it is not profitable to sign-extend to 256-bit because this will
29347     // require an extra cross-lane shuffle which is more expensive than
29348     // truncating the result of the compare to 128-bits.
29349     break;
29350   case MVT::v32i1:
29351     // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
29352     if (!Subtarget.hasInt256())
29353       return SDValue();
29354     SExtVT = MVT::v32i8;
29355     break;
29356   };
29357
29358   SDLoc DL(BitCast);
29359   SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
29360   if (SExtVT == MVT::v8i16) {
29361     V = DAG.getBitcast(MVT::v16i8, V);
29362     V = DAG.getVectorShuffle(
29363         MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
29364         {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
29365   } else
29366     assert(SExtVT.getScalarType() != MVT::i16 &&
29367            "Vectors of i16 must be shuffled");
29368   if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29369     V = DAG.getBitcast(FPCastVT, V);
29370   V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29371   return DAG.getZExtOrTrunc(V, DL, VT);
29372 }
29373
29374 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29375                               TargetLowering::DAGCombinerInfo &DCI,
29376                               const X86Subtarget &Subtarget) {
29377   SDValue N0 = N->getOperand(0);
29378   EVT VT = N->getValueType(0);
29379   EVT SrcVT = N0.getValueType();
29380
29381   // Try to match patterns such as
29382   // (i16 bitcast (v16i1 x))
29383   // ->
29384   // (i16 movmsk (16i8 sext (v16i1 x)))
29385   // before the setcc result is scalarized on subtargets that don't have legal
29386   // vxi1 types.
29387   if (DCI.isBeforeLegalize())
29388     if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29389       return V;
29390   // Since MMX types are special and don't usually play with other vector types,
29391   // it's better to handle them early to be sure we emit efficient code by
29392   // avoiding store-load conversions.
29393
29394   // Detect bitcasts between i32 to x86mmx low word.
29395   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29396       SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29397     SDValue N00 = N0->getOperand(0);
29398     if (N00.getValueType() == MVT::i32)
29399       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29400   }
29401
29402   // Detect bitcasts between element or subvector extraction to x86mmx.
29403   if (VT == MVT::x86mmx &&
29404       (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29405        N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29406       isNullConstant(N0.getOperand(1))) {
29407     SDValue N00 = N0->getOperand(0);
29408     if (N00.getValueType().is128BitVector())
29409       return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29410                          DAG.getBitcast(MVT::v2i64, N00));
29411   }
29412
29413   // Detect bitcasts from FP_TO_SINT to x86mmx.
29414   if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29415       N0.getOpcode() == ISD::FP_TO_SINT) {
29416     SDLoc DL(N0);
29417     SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29418                               DAG.getUNDEF(MVT::v2i32));
29419     return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29420                        DAG.getBitcast(MVT::v2i64, Res));
29421   }
29422
29423   // Convert a bitcasted integer logic operation that has one bitcasted
29424   // floating-point operand into a floating-point logic operation. This may
29425   // create a load of a constant, but that is cheaper than materializing the
29426   // constant in an integer register and transferring it to an SSE register or
29427   // transferring the SSE operand to integer register and back.
29428   unsigned FPOpcode;
29429   switch (N0.getOpcode()) {
29430     case ISD::AND: FPOpcode = X86ISD::FAND; break;
29431     case ISD::OR:  FPOpcode = X86ISD::FOR;  break;
29432     case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29433     default: return SDValue();
29434   }
29435
29436   if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29437         (Subtarget.hasSSE2() && VT == MVT::f64)))
29438     return SDValue();
29439
29440   SDValue LogicOp0 = N0.getOperand(0);
29441   SDValue LogicOp1 = N0.getOperand(1);
29442   SDLoc DL0(N0);
29443
29444   // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29445   if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29446       LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29447       !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29448     SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29449     return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29450   }
29451   // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29452   if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29453       LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29454       !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29455     SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29456     return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29457   }
29458
29459   return SDValue();
29460 }
29461
29462 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29463 // the elements of a vector.
29464 // Returns the vector that is being reduced on, or SDValue() if a reduction
29465 // was not matched.
29466 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29467   // The pattern must end in an extract from index 0.
29468   if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29469       !isNullConstant(Extract->getOperand(1)))
29470     return SDValue();
29471
29472   unsigned Stages =
29473       Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29474
29475   SDValue Op = Extract->getOperand(0);
29476   // At each stage, we're looking for something that looks like:
29477   // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29478   //                    <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29479   //                               i32 undef, i32 undef, i32 undef, i32 undef>
29480   // %a = binop <8 x i32> %op, %s
29481   // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29482   // we expect something like:
29483   // <4,5,6,7,u,u,u,u>
29484   // <2,3,u,u,u,u,u,u>
29485   // <1,u,u,u,u,u,u,u>
29486   for (unsigned i = 0; i < Stages; ++i) {
29487     if (Op.getOpcode() != BinOp)
29488       return SDValue();
29489
29490     ShuffleVectorSDNode *Shuffle =
29491         dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29492     if (Shuffle) {
29493       Op = Op.getOperand(1);
29494     } else {
29495       Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29496       Op = Op.getOperand(0);
29497     }
29498
29499     // The first operand of the shuffle should be the same as the other operand
29500     // of the add.
29501     if (!Shuffle || (Shuffle->getOperand(0) != Op))
29502       return SDValue();
29503
29504     // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29505     for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29506       if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29507         return SDValue();
29508   }
29509
29510   return Op;
29511 }
29512
29513 // Given a select, detect the following pattern:
29514 // 1:    %2 = zext <N x i8> %0 to <N x i32>
29515 // 2:    %3 = zext <N x i8> %1 to <N x i32>
29516 // 3:    %4 = sub nsw <N x i32> %2, %3
29517 // 4:    %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29518 // 5:    %6 = sub nsw <N x i32> zeroinitializer, %4
29519 // 6:    %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29520 // This is useful as it is the input into a SAD pattern.
29521 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29522                               SDValue &Op1) {
29523   // Check the condition of the select instruction is greater-than.
29524   SDValue SetCC = Select->getOperand(0);
29525   if (SetCC.getOpcode() != ISD::SETCC)
29526     return false;
29527   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29528   if (CC != ISD::SETGT && CC != ISD::SETLT)
29529     return false;
29530
29531   SDValue SelectOp1 = Select->getOperand(1);
29532   SDValue SelectOp2 = Select->getOperand(2);
29533
29534   // The following instructions assume SelectOp1 is the subtraction operand
29535   // and SelectOp2 is the negation operand.
29536   // In the case of SETLT this is the other way around.
29537   if (CC == ISD::SETLT)
29538     std::swap(SelectOp1, SelectOp2);
29539
29540   // The second operand of the select should be the negation of the first
29541   // operand, which is implemented as 0 - SelectOp1.
29542   if (!(SelectOp2.getOpcode() == ISD::SUB &&
29543         ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29544         SelectOp2.getOperand(1) == SelectOp1))
29545     return false;
29546
29547   // The first operand of SetCC is the first operand of the select, which is the
29548   // difference between the two input vectors.
29549   if (SetCC.getOperand(0) != SelectOp1)
29550     return false;
29551
29552   // In SetLT case, The second operand of the comparison can be either 1 or 0.
29553   APInt SplatVal;
29554   if ((CC == ISD::SETLT) &&
29555       !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal,
29556                                     /*AllowShrink*/false) &&
29557          SplatVal.isOneValue()) ||
29558         (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29559     return false;
29560
29561   // In SetGT case, The second operand of the comparison can be either -1 or 0.
29562   if ((CC == ISD::SETGT) &&
29563       !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29564         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29565     return false;
29566
29567   // The first operand of the select is the difference between the two input
29568   // vectors.
29569   if (SelectOp1.getOpcode() != ISD::SUB)
29570     return false;
29571
29572   Op0 = SelectOp1.getOperand(0);
29573   Op1 = SelectOp1.getOperand(1);
29574
29575   // Check if the operands of the sub are zero-extended from vectors of i8.
29576   if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29577       Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29578       Op1.getOpcode() != ISD::ZERO_EXTEND ||
29579       Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29580     return false;
29581
29582   return true;
29583 }
29584
29585 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29586 // to these zexts.
29587 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29588                             const SDValue &Zext1, const SDLoc &DL) {
29589
29590   // Find the appropriate width for the PSADBW.
29591   EVT InVT = Zext0.getOperand(0).getValueType();
29592   unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29593
29594   // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29595   // fill in the missing vector elements with 0.
29596   unsigned NumConcat = RegSize / InVT.getSizeInBits();
29597   SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29598   Ops[0] = Zext0.getOperand(0);
29599   MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29600   SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29601   Ops[0] = Zext1.getOperand(0);
29602   SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29603
29604   // Actually build the SAD
29605   MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29606   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29607 }
29608
29609 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29610 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29611                                                 SelectionDAG &DAG,
29612                                                 const X86Subtarget &Subtarget) {
29613   // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29614   if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29615     return SDValue();
29616
29617   EVT ExtractVT = Extract->getValueType(0);
29618   unsigned BitWidth = ExtractVT.getSizeInBits();
29619   if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29620       ExtractVT != MVT::i8)
29621     return SDValue();
29622
29623   // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29624   for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29625     SDValue Match = matchBinOpReduction(Extract, Op);
29626     if (!Match)
29627       continue;
29628
29629     // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29630     // which we can't support here for now.
29631     if (Match.getScalarValueSizeInBits() != BitWidth)
29632       continue;
29633
29634     // We require AVX2 for PMOVMSKB for v16i16/v32i8;
29635     unsigned MatchSizeInBits = Match.getValueSizeInBits();
29636     if (!(MatchSizeInBits == 128 ||
29637           (MatchSizeInBits == 256 &&
29638            ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29639       return SDValue();
29640
29641     // Don't bother performing this for 2-element vectors.
29642     if (Match.getValueType().getVectorNumElements() <= 2)
29643       return SDValue();
29644
29645     // Check that we are extracting a reduction of all sign bits.
29646     if (DAG.ComputeNumSignBits(Match) != BitWidth)
29647       return SDValue();
29648
29649     // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29650     MVT MaskVT;
29651     if (64 == BitWidth || 32 == BitWidth)
29652       MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29653                                 MatchSizeInBits / BitWidth);
29654     else
29655       MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29656
29657     APInt CompareBits;
29658     ISD::CondCode CondCode;
29659     if (Op == ISD::OR) {
29660       // any_of -> MOVMSK != 0
29661       CompareBits = APInt::getNullValue(32);
29662       CondCode = ISD::CondCode::SETNE;
29663     } else {
29664       // all_of -> MOVMSK == ((1 << NumElts) - 1)
29665       CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29666       CondCode = ISD::CondCode::SETEQ;
29667     }
29668
29669     // Perform the select as i32/i64 and then truncate to avoid partial register
29670     // stalls.
29671     unsigned ResWidth = std::max(BitWidth, 32u);
29672     EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29673     SDLoc DL(Extract);
29674     SDValue Zero = DAG.getConstant(0, DL, ResVT);
29675     SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29676     SDValue Res = DAG.getBitcast(MaskVT, Match);
29677     Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29678     Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29679                           Ones, Zero, CondCode);
29680     return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29681   }
29682
29683   return SDValue();
29684 }
29685
29686 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29687                                       const X86Subtarget &Subtarget) {
29688   // PSADBW is only supported on SSE2 and up.
29689   if (!Subtarget.hasSSE2())
29690     return SDValue();
29691
29692   // Verify the type we're extracting from is any integer type above i16.
29693   EVT VT = Extract->getOperand(0).getValueType();
29694   if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29695     return SDValue();
29696
29697   unsigned RegSize = 128;
29698   if (Subtarget.hasBWI())
29699     RegSize = 512;
29700   else if (Subtarget.hasAVX2())
29701     RegSize = 256;
29702
29703   // We handle upto v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29704   // TODO: We should be able to handle larger vectors by splitting them before
29705   // feeding them into several SADs, and then reducing over those.
29706   if (RegSize / VT.getVectorNumElements() < 8)
29707     return SDValue();
29708
29709   // Match shuffle + add pyramid.
29710   SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29711
29712   // The operand is expected to be zero extended from i8
29713   // (verified in detectZextAbsDiff).
29714   // In order to convert to i64 and above, additional any/zero/sign
29715   // extend is expected.
29716   // The zero extend from 32 bit has no mathematical effect on the result.
29717   // Also the sign extend is basically zero extend
29718   // (extends the sign bit which is zero).
29719   // So it is correct to skip the sign/zero extend instruction.
29720   if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29721     Root.getOpcode() == ISD::ZERO_EXTEND ||
29722     Root.getOpcode() == ISD::ANY_EXTEND))
29723     Root = Root.getOperand(0);
29724
29725   // If there was a match, we want Root to be a select that is the root of an
29726   // abs-diff pattern.
29727   if (!Root || (Root.getOpcode() != ISD::VSELECT))
29728     return SDValue();
29729
29730   // Check whether we have an abs-diff pattern feeding into the select.
29731   SDValue Zext0, Zext1;
29732   if (!detectZextAbsDiff(Root, Zext0, Zext1))
29733     return SDValue();
29734
29735   // Create the SAD instruction.
29736   SDLoc DL(Extract);
29737   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29738
29739   // If the original vector was wider than 8 elements, sum over the results
29740   // in the SAD vector.
29741   unsigned Stages = Log2_32(VT.getVectorNumElements());
29742   MVT SadVT = SAD.getSimpleValueType();
29743   if (Stages > 3) {
29744     unsigned SadElems = SadVT.getVectorNumElements();
29745
29746     for(unsigned i = Stages - 3; i > 0; --i) {
29747       SmallVector<int, 16> Mask(SadElems, -1);
29748       for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29749         Mask[j] = MaskEnd + j;
29750
29751       SDValue Shuffle =
29752           DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29753       SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29754     }
29755   }
29756
29757   MVT Type = Extract->getSimpleValueType(0);
29758   unsigned TypeSizeInBits = Type.getSizeInBits();
29759   // Return the lowest TypeSizeInBits bits.
29760   MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29761   SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29762   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29763                      Extract->getOperand(1));
29764 }
29765
29766 // Attempt to peek through a target shuffle and extract the scalar from the
29767 // source.
29768 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29769                                          TargetLowering::DAGCombinerInfo &DCI,
29770                                          const X86Subtarget &Subtarget) {
29771   if (DCI.isBeforeLegalizeOps())
29772     return SDValue();
29773
29774   SDValue Src = N->getOperand(0);
29775   SDValue Idx = N->getOperand(1);
29776
29777   EVT VT = N->getValueType(0);
29778   EVT SrcVT = Src.getValueType();
29779   EVT SrcSVT = SrcVT.getVectorElementType();
29780   unsigned NumSrcElts = SrcVT.getVectorNumElements();
29781
29782   // Don't attempt this for boolean mask vectors or unknown extraction indices.
29783   if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29784     return SDValue();
29785
29786   // Resolve the target shuffle inputs and mask.
29787   SmallVector<int, 16> Mask;
29788   SmallVector<SDValue, 2> Ops;
29789   if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
29790     return SDValue();
29791
29792   // Attempt to narrow/widen the shuffle mask to the correct size.
29793   if (Mask.size() != NumSrcElts) {
29794     if ((NumSrcElts % Mask.size()) == 0) {
29795       SmallVector<int, 16> ScaledMask;
29796       int Scale = NumSrcElts / Mask.size();
29797       scaleShuffleMask(Scale, Mask, ScaledMask);
29798       Mask = std::move(ScaledMask);
29799     } else if ((Mask.size() % NumSrcElts) == 0) {
29800       SmallVector<int, 16> WidenedMask;
29801       while (Mask.size() > NumSrcElts &&
29802              canWidenShuffleElements(Mask, WidenedMask))
29803         Mask = std::move(WidenedMask);
29804       // TODO - investigate support for wider shuffle masks with known upper
29805       // undef/zero elements for implicit zero-extension.
29806     }
29807   }
29808
29809   // Check if narrowing/widening failed.
29810   if (Mask.size() != NumSrcElts)
29811     return SDValue();
29812
29813   int SrcIdx = Mask[N->getConstantOperandVal(1)];
29814   SDLoc dl(N);
29815
29816   // If the shuffle source element is undef/zero then we can just accept it.
29817   if (SrcIdx == SM_SentinelUndef)
29818     return DAG.getUNDEF(VT);
29819
29820   if (SrcIdx == SM_SentinelZero)
29821     return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29822                                 : DAG.getConstant(0, dl, VT);
29823
29824   SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29825   SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29826   SrcIdx = SrcIdx % Mask.size();
29827
29828   // We can only extract other elements from 128-bit vectors and in certain
29829   // circumstances, depending on SSE-level.
29830   // TODO: Investigate using extract_subvector for larger vectors.
29831   // TODO: Investigate float/double extraction if it will be just stored.
29832   if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29833       ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29834     assert(SrcSVT == VT && "Unexpected extraction type");
29835     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29836                        DAG.getIntPtrConstant(SrcIdx, dl));
29837   }
29838
29839   if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29840       (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29841     assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29842            "Unexpected extraction type");
29843     unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29844     SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29845                                 DAG.getIntPtrConstant(SrcIdx, dl));
29846     SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29847                                  DAG.getValueType(SrcSVT));
29848     return DAG.getZExtOrTrunc(Assert, dl, VT);
29849   }
29850
29851   return SDValue();
29852 }
29853
29854 /// Detect vector gather/scatter index generation and convert it from being a
29855 /// bunch of shuffles and extracts into a somewhat faster sequence.
29856 /// For i686, the best sequence is apparently storing the value and loading
29857 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
29858 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29859                                        TargetLowering::DAGCombinerInfo &DCI,
29860                                        const X86Subtarget &Subtarget) {
29861   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29862     return NewOp;
29863
29864   if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29865     return NewOp;
29866
29867   SDValue InputVector = N->getOperand(0);
29868   SDValue EltIdx = N->getOperand(1);
29869
29870   EVT SrcVT = InputVector.getValueType();
29871   EVT VT = N->getValueType(0);
29872   SDLoc dl(InputVector);
29873
29874   // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
29875   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29876       VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29877     SDValue MMXSrc = InputVector.getOperand(0);
29878
29879     // The bitcast source is a direct mmx result.
29880     if (MMXSrc.getValueType() == MVT::x86mmx)
29881       return DAG.getBitcast(VT, InputVector);
29882   }
29883
29884   // Detect mmx to i32 conversion through a v2i32 elt extract.
29885   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29886       VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29887     SDValue MMXSrc = InputVector.getOperand(0);
29888
29889     // The bitcast source is a direct mmx result.
29890     if (MMXSrc.getValueType() == MVT::x86mmx)
29891       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29892   }
29893
29894   if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29895       isa<ConstantSDNode>(EltIdx) &&
29896       isa<ConstantSDNode>(InputVector.getOperand(0))) {
29897     uint64_t ExtractedElt = N->getConstantOperandVal(1);
29898     uint64_t InputValue = InputVector.getConstantOperandVal(0);
29899     uint64_t Res = (InputValue >> ExtractedElt) & 1;
29900     return DAG.getConstant(Res, dl, MVT::i1);
29901   }
29902
29903   // Check whether this extract is the root of a sum of absolute differences
29904   // pattern. This has to be done here because we really want it to happen
29905   // pre-legalization,
29906   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29907     return SAD;
29908
29909   // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29910   if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29911     return Cmp;
29912
29913   // Only operate on vectors of 4 elements, where the alternative shuffling
29914   // gets to be more expensive.
29915   if (SrcVT != MVT::v4i32)
29916     return SDValue();
29917
29918   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29919   // single use which is a sign-extend or zero-extend, and all elements are
29920   // used.
29921   SmallVector<SDNode *, 4> Uses;
29922   unsigned ExtractedElements = 0;
29923   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29924        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29925     if (UI.getUse().getResNo() != InputVector.getResNo())
29926       return SDValue();
29927
29928     SDNode *Extract = *UI;
29929     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29930       return SDValue();
29931
29932     if (Extract->getValueType(0) != MVT::i32)
29933       return SDValue();
29934     if (!Extract->hasOneUse())
29935       return SDValue();
29936     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29937         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29938       return SDValue();
29939     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29940       return SDValue();
29941
29942     // Record which element was extracted.
29943     ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29944     Uses.push_back(Extract);
29945   }
29946
29947   // If not all the elements were used, this may not be worthwhile.
29948   if (ExtractedElements != 15)
29949     return SDValue();
29950
29951   // Ok, we've now decided to do the transformation.
29952   // If 64-bit shifts are legal, use the extract-shift sequence,
29953   // otherwise bounce the vector off the cache.
29954   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29955   SDValue Vals[4];
29956
29957   if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29958     SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29959     auto &DL = DAG.getDataLayout();
29960     EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29961     SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29962       DAG.getConstant(0, dl, VecIdxTy));
29963     SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29964       DAG.getConstant(1, dl, VecIdxTy));
29965
29966     SDValue ShAmt = DAG.getConstant(
29967         32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29968     Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29969     Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29970       DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29971     Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29972     Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29973       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29974   } else {
29975     // Store the value to a temporary stack slot.
29976     SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29977     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29978                               MachinePointerInfo());
29979
29980     EVT ElementType = SrcVT.getVectorElementType();
29981     unsigned EltSize = ElementType.getSizeInBits() / 8;
29982
29983     // Replace each use (extract) with a load of the appropriate element.
29984     for (unsigned i = 0; i < 4; ++i) {
29985       uint64_t Offset = EltSize * i;
29986       auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29987       SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29988
29989       SDValue ScalarAddr =
29990           DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29991
29992       // Load the scalar.
29993       Vals[i] =
29994           DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29995     }
29996   }
29997
29998   // Replace the extracts
29999   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
30000     UE = Uses.end(); UI != UE; ++UI) {
30001     SDNode *Extract = *UI;
30002
30003     uint64_t IdxVal = Extract->getConstantOperandVal(1);
30004     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
30005   }
30006
30007   // The replacement was made in place; don't return anything.
30008   return SDValue();
30009 }
30010
30011 // TODO - merge with combineExtractVectorElt once it can handle the implicit
30012 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
30013 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30014 // combineBasicSADPattern.
30015 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
30016                                            TargetLowering::DAGCombinerInfo &DCI,
30017                                            const X86Subtarget &Subtarget) {
30018   return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
30019 }
30020
30021 /// If a vector select has an operand that is -1 or 0, try to simplify the
30022 /// select to a bitwise logic operation.
30023 static SDValue
30024 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30025                                  TargetLowering::DAGCombinerInfo &DCI,
30026                                  const X86Subtarget &Subtarget) {
30027   SDValue Cond = N->getOperand(0);
30028   SDValue LHS = N->getOperand(1);
30029   SDValue RHS = N->getOperand(2);
30030   EVT VT = LHS.getValueType();
30031   EVT CondVT = Cond.getValueType();
30032   SDLoc DL(N);
30033   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30034
30035   if (N->getOpcode() != ISD::VSELECT)
30036     return SDValue();
30037
30038   assert(CondVT.isVector() && "Vector select expects a vector selector!");
30039
30040   bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30041   // Check if the first operand is all zeros and Cond type is vXi1.
30042   // This situation only applies to avx512.
30043   if (FValIsAllZeros  && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30044       CondVT.getVectorElementType() == MVT::i1) {
30045     // Invert the cond to not(cond) : xor(op,allones)=not(op)
30046     SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30047                                   DAG.getAllOnesConstant(DL, CondVT));
30048     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30049     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30050   }
30051
30052   // To use the condition operand as a bitwise mask, it must have elements that
30053   // are the same size as the select elements. Ie, the condition operand must
30054   // have already been promoted from the IR select condition type <N x i1>.
30055   // Don't check if the types themselves are equal because that excludes
30056   // vector floating-point selects.
30057   if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30058     return SDValue();
30059
30060   bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30061   FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30062
30063   // Try to invert the condition if true value is not all 1s and false value is
30064   // not all 0s.
30065   if (!TValIsAllOnes && !FValIsAllZeros &&
30066       // Check if the selector will be produced by CMPP*/PCMP*.
30067       Cond.getOpcode() == ISD::SETCC &&
30068       // Check if SETCC has already been promoted.
30069       TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30070           CondVT) {
30071     bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30072     bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30073
30074     if (TValIsAllZeros || FValIsAllOnes) {
30075       SDValue CC = Cond.getOperand(2);
30076       ISD::CondCode NewCC =
30077           ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30078                                Cond.getOperand(0).getValueType().isInteger());
30079       Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30080                           NewCC);
30081       std::swap(LHS, RHS);
30082       TValIsAllOnes = FValIsAllOnes;
30083       FValIsAllZeros = TValIsAllZeros;
30084     }
30085   }
30086
30087   // vselect Cond, 111..., 000... -> Cond
30088   if (TValIsAllOnes && FValIsAllZeros)
30089     return DAG.getBitcast(VT, Cond);
30090
30091   if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
30092     return SDValue();
30093
30094   // vselect Cond, 111..., X -> or Cond, X
30095   if (TValIsAllOnes) {
30096     SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30097     SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30098     return DAG.getBitcast(VT, Or);
30099   }
30100
30101   // vselect Cond, X, 000... -> and Cond, X
30102   if (FValIsAllZeros) {
30103     SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30104     SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30105     return DAG.getBitcast(VT, And);
30106   }
30107
30108   return SDValue();
30109 }
30110
30111 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
30112   SDValue Cond = N->getOperand(0);
30113   SDValue LHS = N->getOperand(1);
30114   SDValue RHS = N->getOperand(2);
30115   SDLoc DL(N);
30116
30117   auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
30118   auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
30119   if (!TrueC || !FalseC)
30120     return SDValue();
30121
30122   // Don't do this for crazy integer types.
30123   if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
30124     return SDValue();
30125
30126   // If this is efficiently invertible, canonicalize the LHSC/RHSC values
30127   // so that TrueC (the true value) is larger than FalseC.
30128   bool NeedsCondInvert = false;
30129   if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
30130       // Efficiently invertible.
30131       (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
30132        (Cond.getOpcode() == ISD::XOR &&  // xor(X, C) -> invertible.
30133         isa<ConstantSDNode>(Cond.getOperand(1))))) {
30134     NeedsCondInvert = true;
30135     std::swap(TrueC, FalseC);
30136   }
30137
30138   // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
30139   if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30140     if (NeedsCondInvert) // Invert the condition if needed.
30141       Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30142                          DAG.getConstant(1, DL, Cond.getValueType()));
30143
30144     // Zero extend the condition if needed.
30145     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
30146
30147     unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30148     return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
30149                        DAG.getConstant(ShAmt, DL, MVT::i8));
30150   }
30151
30152   // Optimize cases that will turn into an LEA instruction.  This requires
30153   // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30154   if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30155     uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
30156     if (N->getValueType(0) == MVT::i32)
30157       Diff = (unsigned)Diff;
30158
30159     bool IsFastMultiplier = false;
30160     if (Diff < 10) {
30161       switch ((unsigned char)Diff) {
30162       default:
30163         break;
30164       case 1: // result = add base, cond
30165       case 2: // result = lea base(    , cond*2)
30166       case 3: // result = lea base(cond, cond*2)
30167       case 4: // result = lea base(    , cond*4)
30168       case 5: // result = lea base(cond, cond*4)
30169       case 8: // result = lea base(    , cond*8)
30170       case 9: // result = lea base(cond, cond*8)
30171         IsFastMultiplier = true;
30172         break;
30173       }
30174     }
30175
30176     if (IsFastMultiplier) {
30177       APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
30178       if (NeedsCondInvert) // Invert the condition if needed.
30179         Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30180                            DAG.getConstant(1, DL, Cond.getValueType()));
30181
30182       // Zero extend the condition if needed.
30183       Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
30184       // Scale the condition by the difference.
30185       if (Diff != 1)
30186         Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30187                            DAG.getConstant(Diff, DL, Cond.getValueType()));
30188
30189       // Add the base if non-zero.
30190       if (FalseC->getAPIntValue() != 0)
30191         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30192                            SDValue(FalseC, 0));
30193       return Cond;
30194     }
30195   }
30196
30197   return SDValue();
30198 }
30199
30200 // If this is a bitcasted op that can be represented as another type, push the
30201 // the bitcast to the inputs. This allows more opportunities for pattern
30202 // matching masked instructions. This is called when we know that the operation
30203 // is used as one of the inputs of a vselect.
30204 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
30205                                       TargetLowering::DAGCombinerInfo &DCI) {
30206   // Make sure we have a bitcast.
30207   if (OrigOp.getOpcode() != ISD::BITCAST)
30208     return false;
30209
30210   SDValue Op = OrigOp.getOperand(0);
30211
30212   // If the operation is used by anything other than the bitcast, we shouldn't
30213   // do this combine as that would replicate the operation.
30214   if (!Op.hasOneUse())
30215     return false;
30216
30217   MVT VT = OrigOp.getSimpleValueType();
30218   MVT EltVT = VT.getVectorElementType();
30219   SDLoc DL(Op.getNode());
30220
30221   auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
30222                                       SDValue Op2) {
30223     Op0 = DAG.getBitcast(VT, Op0);
30224     DCI.AddToWorklist(Op0.getNode());
30225     Op1 = DAG.getBitcast(VT, Op1);
30226     DCI.AddToWorklist(Op1.getNode());
30227     DCI.CombineTo(OrigOp.getNode(),
30228                   DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
30229     return true;
30230   };
30231
30232   unsigned Opcode = Op.getOpcode();
30233   switch (Opcode) {
30234   case X86ISD::PALIGNR:
30235     // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
30236     if (!VT.is128BitVector())
30237       return false;
30238     Opcode = X86ISD::VALIGN;
30239     LLVM_FALLTHROUGH;
30240   case X86ISD::VALIGN: {
30241     if (EltVT != MVT::i32 && EltVT != MVT::i64)
30242       return false;
30243     uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30244     MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30245     unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
30246     unsigned EltSize = EltVT.getSizeInBits();
30247     // Make sure we can represent the same shift with the new VT.
30248     if ((ShiftAmt % EltSize) != 0)
30249       return false;
30250     Imm = ShiftAmt / EltSize;
30251     return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30252                                     DAG.getConstant(Imm, DL, MVT::i8));
30253   }
30254   case X86ISD::SHUF128: {
30255     if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
30256       return false;
30257     // Only change element size, not type.
30258     if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30259       return false;
30260     return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30261                                     Op.getOperand(2));
30262   }
30263   case ISD::INSERT_SUBVECTOR: {
30264     unsigned EltSize = EltVT.getSizeInBits();
30265     if (EltSize != 32 && EltSize != 64)
30266       return false;
30267     MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30268     // Only change element size, not type.
30269     if (EltVT.isInteger() != OpEltVT.isInteger())
30270       return false;
30271     uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30272     Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30273     SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
30274     DCI.AddToWorklist(Op0.getNode());
30275     // Op1 needs to be bitcasted to a smaller vector with the same element type.
30276     SDValue Op1 = Op.getOperand(1);
30277     MVT Op1VT = MVT::getVectorVT(EltVT,
30278                             Op1.getSimpleValueType().getSizeInBits() / EltSize);
30279     Op1 = DAG.getBitcast(Op1VT, Op1);
30280     DCI.AddToWorklist(Op1.getNode());
30281     DCI.CombineTo(OrigOp.getNode(),
30282                   DAG.getNode(Opcode, DL, VT, Op0, Op1,
30283                               DAG.getIntPtrConstant(Imm, DL)));
30284     return true;
30285   }
30286   case ISD::EXTRACT_SUBVECTOR: {
30287     unsigned EltSize = EltVT.getSizeInBits();
30288     if (EltSize != 32 && EltSize != 64)
30289       return false;
30290     MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30291     // Only change element size, not type.
30292     if (EltVT.isInteger() != OpEltVT.isInteger())
30293       return false;
30294     uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
30295     Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30296     // Op0 needs to be bitcasted to a larger vector with the same element type.
30297     SDValue Op0 = Op.getOperand(0);
30298     MVT Op0VT = MVT::getVectorVT(EltVT,
30299                             Op0.getSimpleValueType().getSizeInBits() / EltSize);
30300     Op0 = DAG.getBitcast(Op0VT, Op0);
30301     DCI.AddToWorklist(Op0.getNode());
30302     DCI.CombineTo(OrigOp.getNode(),
30303                   DAG.getNode(Opcode, DL, VT, Op0,
30304                               DAG.getIntPtrConstant(Imm, DL)));
30305     return true;
30306   }
30307   case X86ISD::SUBV_BROADCAST: {
30308     unsigned EltSize = EltVT.getSizeInBits();
30309     if (EltSize != 32 && EltSize != 64)
30310       return false;
30311     // Only change element size, not type.
30312     if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30313       return false;
30314     SDValue Op0 = Op.getOperand(0);
30315     MVT Op0VT = MVT::getVectorVT(EltVT,
30316                             Op0.getSimpleValueType().getSizeInBits() / EltSize);
30317     Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
30318     DCI.AddToWorklist(Op0.getNode());
30319     DCI.CombineTo(OrigOp.getNode(),
30320                   DAG.getNode(Opcode, DL, VT, Op0));
30321     return true;
30322   }
30323   }
30324
30325   return false;
30326 }
30327
30328 /// Do target-specific dag combines on SELECT and VSELECT nodes.
30329 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
30330                              TargetLowering::DAGCombinerInfo &DCI,
30331                              const X86Subtarget &Subtarget) {
30332   SDLoc DL(N);
30333   SDValue Cond = N->getOperand(0);
30334   // Get the LHS/RHS of the select.
30335   SDValue LHS = N->getOperand(1);
30336   SDValue RHS = N->getOperand(2);
30337   EVT VT = LHS.getValueType();
30338   EVT CondVT = Cond.getValueType();
30339   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30340
30341   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
30342   // instructions match the semantics of the common C idiom x<y?x:y but not
30343   // x<=y?x:y, because of how they handle negative zero (which can be
30344   // ignored in unsafe-math mode).
30345   // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
30346   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
30347       VT != MVT::f80 && VT != MVT::f128 &&
30348       (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
30349       (Subtarget.hasSSE2() ||
30350        (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
30351     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30352
30353     unsigned Opcode = 0;
30354     // Check for x CC y ? x : y.
30355     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30356         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30357       switch (CC) {
30358       default: break;
30359       case ISD::SETULT:
30360         // Converting this to a min would handle NaNs incorrectly, and swapping
30361         // the operands would cause it to handle comparisons between positive
30362         // and negative zero incorrectly.
30363         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30364           if (!DAG.getTarget().Options.UnsafeFPMath &&
30365               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30366             break;
30367           std::swap(LHS, RHS);
30368         }
30369         Opcode = X86ISD::FMIN;
30370         break;
30371       case ISD::SETOLE:
30372         // Converting this to a min would handle comparisons between positive
30373         // and negative zero incorrectly.
30374         if (!DAG.getTarget().Options.UnsafeFPMath &&
30375             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30376           break;
30377         Opcode = X86ISD::FMIN;
30378         break;
30379       case ISD::SETULE:
30380         // Converting this to a min would handle both negative zeros and NaNs
30381         // incorrectly, but we can swap the operands to fix both.
30382         std::swap(LHS, RHS);
30383         LLVM_FALLTHROUGH;
30384       case ISD::SETOLT:
30385       case ISD::SETLT:
30386       case ISD::SETLE:
30387         Opcode = X86ISD::FMIN;
30388         break;
30389
30390       case ISD::SETOGE:
30391         // Converting this to a max would handle comparisons between positive
30392         // and negative zero incorrectly.
30393         if (!DAG.getTarget().Options.UnsafeFPMath &&
30394             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30395           break;
30396         Opcode = X86ISD::FMAX;
30397         break;
30398       case ISD::SETUGT:
30399         // Converting this to a max would handle NaNs incorrectly, and swapping
30400         // the operands would cause it to handle comparisons between positive
30401         // and negative zero incorrectly.
30402         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30403           if (!DAG.getTarget().Options.UnsafeFPMath &&
30404               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30405             break;
30406           std::swap(LHS, RHS);
30407         }
30408         Opcode = X86ISD::FMAX;
30409         break;
30410       case ISD::SETUGE:
30411         // Converting this to a max would handle both negative zeros and NaNs
30412         // incorrectly, but we can swap the operands to fix both.
30413         std::swap(LHS, RHS);
30414         LLVM_FALLTHROUGH;
30415       case ISD::SETOGT:
30416       case ISD::SETGT:
30417       case ISD::SETGE:
30418         Opcode = X86ISD::FMAX;
30419         break;
30420       }
30421     // Check for x CC y ? y : x -- a min/max with reversed arms.
30422     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30423                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
30424       switch (CC) {
30425       default: break;
30426       case ISD::SETOGE:
30427         // Converting this to a min would handle comparisons between positive
30428         // and negative zero incorrectly, and swapping the operands would
30429         // cause it to handle NaNs incorrectly.
30430         if (!DAG.getTarget().Options.UnsafeFPMath &&
30431             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
30432           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30433             break;
30434           std::swap(LHS, RHS);
30435         }
30436         Opcode = X86ISD::FMIN;
30437         break;
30438       case ISD::SETUGT:
30439         // Converting this to a min would handle NaNs incorrectly.
30440         if (!DAG.getTarget().Options.UnsafeFPMath &&
30441             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30442           break;
30443         Opcode = X86ISD::FMIN;
30444         break;
30445       case ISD::SETUGE:
30446         // Converting this to a min would handle both negative zeros and NaNs
30447         // incorrectly, but we can swap the operands to fix both.
30448         std::swap(LHS, RHS);
30449         LLVM_FALLTHROUGH;
30450       case ISD::SETOGT:
30451       case ISD::SETGT:
30452       case ISD::SETGE:
30453         Opcode = X86ISD::FMIN;
30454         break;
30455
30456       case ISD::SETULT:
30457         // Converting this to a max would handle NaNs incorrectly.
30458         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30459           break;
30460         Opcode = X86ISD::FMAX;
30461         break;
30462       case ISD::SETOLE:
30463         // Converting this to a max would handle comparisons between positive
30464         // and negative zero incorrectly, and swapping the operands would
30465         // cause it to handle NaNs incorrectly.
30466         if (!DAG.getTarget().Options.UnsafeFPMath &&
30467             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30468           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30469             break;
30470           std::swap(LHS, RHS);
30471         }
30472         Opcode = X86ISD::FMAX;
30473         break;
30474       case ISD::SETULE:
30475         // Converting this to a max would handle both negative zeros and NaNs
30476         // incorrectly, but we can swap the operands to fix both.
30477         std::swap(LHS, RHS);
30478         LLVM_FALLTHROUGH;
30479       case ISD::SETOLT:
30480       case ISD::SETLT:
30481       case ISD::SETLE:
30482         Opcode = X86ISD::FMAX;
30483         break;
30484       }
30485     }
30486
30487     if (Opcode)
30488       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30489   }
30490
30491   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30492   // lowering on KNL. In this case we convert it to
30493   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
30494   // The same situation for all 128 and 256-bit vectors of i8 and i16.
30495   // Since SKX these selects have a proper lowering.
30496   if (Subtarget.hasAVX512() && CondVT.isVector() &&
30497       CondVT.getVectorElementType() == MVT::i1 &&
30498       (VT.is128BitVector() || VT.is256BitVector()) &&
30499       (VT.getVectorElementType() == MVT::i8 ||
30500        VT.getVectorElementType() == MVT::i16) &&
30501       !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30502     Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30503     DCI.AddToWorklist(Cond.getNode());
30504     return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30505   }
30506
30507   if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30508     return V;
30509
30510   // Canonicalize max and min:
30511   // (x > y) ? x : y -> (x >= y) ? x : y
30512   // (x < y) ? x : y -> (x <= y) ? x : y
30513   // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30514   // the need for an extra compare
30515   // against zero. e.g.
  // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
30517   // subl   %esi, %edi
30518   // testl  %edi, %edi
30519   // movl   $0, %eax
30520   // cmovgl %edi, %eax
30521   // =>
30522   // xorl   %eax, %eax
  // subl   %esi, %edi
30524   // cmovsl %eax, %edi
30525   if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30526       DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30527       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30528     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30529     switch (CC) {
30530     default: break;
30531     case ISD::SETLT:
30532     case ISD::SETGT: {
30533       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30534       Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30535                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
30536       return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30537     }
30538     }
30539   }
30540
30541   // Early exit check
30542   if (!TLI.isTypeLegal(VT))
30543     return SDValue();
30544
30545   // Match VSELECTs into subs with unsigned saturation.
30546   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30547       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30548       ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30549        (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30550     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30551
30552     // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30553     // left side invert the predicate to simplify logic below.
30554     SDValue Other;
30555     if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30556       Other = RHS;
30557       CC = ISD::getSetCCInverse(CC, true);
30558     } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30559       Other = LHS;
30560     }
30561
30562     if (Other.getNode() && Other->getNumOperands() == 2 &&
30563         DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30564       SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30565       SDValue CondRHS = Cond->getOperand(1);
30566
30567       // Look for a general sub with unsigned saturation first.
30568       // x >= y ? x-y : 0 --> subus x, y
30569       // x >  y ? x-y : 0 --> subus x, y
30570       if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30571           Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30572         return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30573
30574       if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30575         if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30576           if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30577             if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30578               // If the RHS is a constant we have to reverse the const
30579               // canonicalization.
30580               // x > C-1 ? x+-C : 0 --> subus x, C
30581               if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30582                   CondRHSConst->getAPIntValue() ==
30583                       (-OpRHSConst->getAPIntValue() - 1))
30584                 return DAG.getNode(
30585                     X86ISD::SUBUS, DL, VT, OpLHS,
30586                     DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30587
30588           // Another special case: If C was a sign bit, the sub has been
30589           // canonicalized into a xor.
30590           // FIXME: Would it be better to use computeKnownBits to determine
30591           //        whether it's safe to decanonicalize the xor?
30592           // x s< 0 ? x^C : 0 --> subus x, C
30593           if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30594               ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30595               OpRHSConst->getAPIntValue().isSignMask())
30596             // Note that we have to rebuild the RHS constant here to ensure we
30597             // don't rely on particular values of undef lanes.
30598             return DAG.getNode(
30599                 X86ISD::SUBUS, DL, VT, OpLHS,
30600                 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30601         }
30602     }
30603   }
30604
30605   if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30606     return V;
30607
30608   // If this is a *dynamic* select (non-constant condition) and we can match
30609   // this node with one of the variable blend instructions, restructure the
30610   // condition so that blends can use the high (sign) bit of each element and
30611   // use SimplifyDemandedBits to simplify the condition operand.
30612   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30613       !DCI.isBeforeLegalize() &&
30614       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30615     unsigned BitWidth = Cond.getScalarValueSizeInBits();
30616
30617     // Don't optimize vector selects that map to mask-registers.
30618     if (BitWidth == 1)
30619       return SDValue();
30620
30621     // We can only handle the cases where VSELECT is directly legal on the
30622     // subtarget. We custom lower VSELECT nodes with constant conditions and
30623     // this makes it hard to see whether a dynamic VSELECT will correctly
30624     // lower, so we both check the operation's status and explicitly handle the
30625     // cases where a *dynamic* blend will fail even though a constant-condition
30626     // blend could be custom lowered.
30627     // FIXME: We should find a better way to handle this class of problems.
30628     // Potentially, we should combine constant-condition vselect nodes
30629     // pre-legalization into shuffles and not mark as many types as custom
30630     // lowered.
30631     if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30632       return SDValue();
30633     // FIXME: We don't support i16-element blends currently. We could and
30634     // should support them by making *all* the bits in the condition be set
30635     // rather than just the high bit and using an i8-element blend.
30636     if (VT.getVectorElementType() == MVT::i16)
30637       return SDValue();
30638     // Dynamic blending was only available from SSE4.1 onward.
30639     if (VT.is128BitVector() && !Subtarget.hasSSE41())
30640       return SDValue();
30641     // Byte blends are only available in AVX2
30642     if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30643       return SDValue();
30644     // There are no 512-bit blend instructions that use sign bits.
30645     if (VT.is512BitVector())
30646       return SDValue();
30647
30648     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30649     APInt DemandedMask(APInt::getSignMask(BitWidth));
30650     KnownBits Known;
30651     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
30652                                           !DCI.isBeforeLegalizeOps());
30653     if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30654         TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30655       // If we changed the computation somewhere in the DAG, this change will
30656       // affect all users of Cond. Make sure it is fine and update all the nodes
30657       // so that we do not use the generic VSELECT anymore. Otherwise, we may
30658       // perform wrong optimizations as we messed with the actual expectation
30659       // for the vector boolean values.
30660       if (Cond != TLO.Old) {
30661         // Check all uses of the condition operand to check whether it will be
30662         // consumed by non-BLEND instructions. Those may require that all bits
30663         // are set properly.
30664         for (SDNode *U : Cond->uses()) {
30665           // TODO: Add other opcodes eventually lowered into BLEND.
30666           if (U->getOpcode() != ISD::VSELECT)
30667             return SDValue();
30668         }
30669
30670         // Update all users of the condition before committing the change, so
30671         // that the VSELECT optimizations that expect the correct vector boolean
30672         // value will not be triggered.
30673         for (SDNode *U : Cond->uses()) {
30674           SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30675                                    U->getValueType(0), Cond, U->getOperand(1),
30676                                    U->getOperand(2));
30677           DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30678         }
30679         DCI.CommitTargetLoweringOpt(TLO);
30680         return SDValue();
30681       }
30682       // Only Cond (rather than other nodes in the computation chain) was
30683       // changed. Change the condition just for N to keep the opportunity to
30684       // optimize all other users their own way.
30685       SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30686       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30687       return SDValue();
30688     }
30689   }
30690
30691   // Look for vselects with LHS/RHS being bitcasted from an operation that
30692   // can be executed on another type. Push the bitcast to the inputs of
30693   // the operation. This exposes opportunities for using masking instructions.
30694   if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30695       CondVT.getVectorElementType() == MVT::i1) {
30696     if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30697       return SDValue(N, 0);
30698     if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30699       return SDValue(N, 0);
30700   }
30701
30702   // Custom action for SELECT MMX
30703   if (VT == MVT::x86mmx) {
30704     LHS = DAG.getBitcast(MVT::i64, LHS);
30705     RHS = DAG.getBitcast(MVT::i64, RHS);
30706     SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
30707     return DAG.getBitcast(VT, newSelect);
30708   }
30709
30710   return SDValue();
30711 }
30712
30713 /// Combine:
30714 ///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
30715 /// to:
30716 ///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30717 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30718 /// Note that this is only legal for some op/cc combinations.
30719 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30720                                        SelectionDAG &DAG) {
30721   // This combine only operates on CMP-like nodes.
30722   if (!(Cmp.getOpcode() == X86ISD::CMP ||
30723         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30724     return SDValue();
30725
30726   // Can't replace the cmp if it has more uses than the one we're looking at.
30727   // FIXME: We would like to be able to handle this, but would need to make sure
30728   // all uses were updated.
30729   if (!Cmp.hasOneUse())
30730     return SDValue();
30731
30732   // This only applies to variations of the common case:
30733   //   (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30734   //   (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30735   //   (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30736   //   (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30737   // Using the proper condcodes (see below), overflow is checked for.
30738
30739   // FIXME: We can generalize both constraints:
30740   // - XOR/OR/AND (if they were made to survive AtomicExpand)
30741   // - LHS != 1
30742   // if the result is compared.
30743
30744   SDValue CmpLHS = Cmp.getOperand(0);
30745   SDValue CmpRHS = Cmp.getOperand(1);
30746
30747   if (!CmpLHS.hasOneUse())
30748     return SDValue();
30749
30750   auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
30751   if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
30752     return SDValue();
30753
30754   const unsigned Opc = CmpLHS.getOpcode();
30755
30756   if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
30757     return SDValue();
30758
30759   SDValue OpRHS = CmpLHS.getOperand(2);
30760   auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
30761   if (!OpRHSC)
30762     return SDValue();
30763
30764   APInt Addend = OpRHSC->getAPIntValue();
30765   if (Opc == ISD::ATOMIC_LOAD_SUB)
30766     Addend = -Addend;
30767
30768   if (CC == X86::COND_S && Addend == 1)
30769     CC = X86::COND_LE;
30770   else if (CC == X86::COND_NS && Addend == 1)
30771     CC = X86::COND_G;
30772   else if (CC == X86::COND_G && Addend == -1)
30773     CC = X86::COND_GE;
30774   else if (CC == X86::COND_LE && Addend == -1)
30775     CC = X86::COND_L;
30776   else
30777     return SDValue();
30778
30779   SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30780   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30781                                 DAG.getUNDEF(CmpLHS.getValueType()));
30782   DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
30783   return LockOp;
30784 }
30785
30786 // Check whether a boolean test is testing a boolean value generated by
30787 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
30788 // code.
30789 //
30790 // Simplify the following patterns:
30791 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30792 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30793 // to (Op EFLAGS Cond)
30794 //
30795 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30796 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30797 // to (Op EFLAGS !Cond)
30798 //
30799 // where Op could be BRCOND or CMOV.
30800 //
30801 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30802   // This combine only operates on CMP-like nodes.
30803   if (!(Cmp.getOpcode() == X86ISD::CMP ||
30804         (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30805     return SDValue();
30806
30807   // Quit if not used as a boolean value.
30808   if (CC != X86::COND_E && CC != X86::COND_NE)
30809     return SDValue();
30810
30811   // Check CMP operands. One of them should be 0 or 1 and the other should be
30812   // an SetCC or extended from it.
30813   SDValue Op1 = Cmp.getOperand(0);
30814   SDValue Op2 = Cmp.getOperand(1);
30815
30816   SDValue SetCC;
30817   const ConstantSDNode* C = nullptr;
30818   bool needOppositeCond = (CC == X86::COND_E);
30819   bool checkAgainstTrue = false; // Is it a comparison against 1?
30820
30821   if ((C = dyn_cast<ConstantSDNode>(Op1)))
30822     SetCC = Op2;
30823   else if ((C = dyn_cast<ConstantSDNode>(Op2)))
30824     SetCC = Op1;
30825   else // Quit if all operands are not constants.
30826     return SDValue();
30827
30828   if (C->getZExtValue() == 1) {
30829     needOppositeCond = !needOppositeCond;
30830     checkAgainstTrue = true;
30831   } else if (C->getZExtValue() != 0)
30832     // Quit if the constant is neither 0 or 1.
30833     return SDValue();
30834
30835   bool truncatedToBoolWithAnd = false;
30836   // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30837   while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30838          SetCC.getOpcode() == ISD::TRUNCATE ||
30839          SetCC.getOpcode() == ISD::AND) {
30840     if (SetCC.getOpcode() == ISD::AND) {
30841       int OpIdx = -1;
30842       if (isOneConstant(SetCC.getOperand(0)))
30843         OpIdx = 1;
30844       if (isOneConstant(SetCC.getOperand(1)))
30845         OpIdx = 0;
30846       if (OpIdx < 0)
30847         break;
30848       SetCC = SetCC.getOperand(OpIdx);
30849       truncatedToBoolWithAnd = true;
30850     } else
30851       SetCC = SetCC.getOperand(0);
30852   }
30853
30854   switch (SetCC.getOpcode()) {
30855   case X86ISD::SETCC_CARRY:
30856     // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30857     // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30858     // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30859     // truncated to i1 using 'and'.
30860     if (checkAgainstTrue && !truncatedToBoolWithAnd)
30861       break;
30862     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
30863            "Invalid use of SETCC_CARRY!");
30864     LLVM_FALLTHROUGH;
30865   case X86ISD::SETCC:
30866     // Set the condition code or opposite one if necessary.
30867     CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30868     if (needOppositeCond)
30869       CC = X86::GetOppositeBranchCondition(CC);
30870     return SetCC.getOperand(1);
30871   case X86ISD::CMOV: {
30872     // Check whether false/true value has canonical one, i.e. 0 or 1.
30873     ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30874     ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
30875     // Quit if true value is not a constant.
30876     if (!TVal)
30877       return SDValue();
30878     // Quit if false value is not a constant.
30879     if (!FVal) {
30880       SDValue Op = SetCC.getOperand(0);
30881       // Skip 'zext' or 'trunc' node.
30882       if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30883           Op.getOpcode() == ISD::TRUNCATE)
30884         Op = Op.getOperand(0);
30885       // A special case for rdrand/rdseed, where 0 is set if false cond is
30886       // found.
30887       if ((Op.getOpcode() != X86ISD::RDRAND &&
30888            Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
30889         return SDValue();
30890     }
30891     // Quit if false value is not the constant 0 or 1.
30892     bool FValIsFalse = true;
30893     if (FVal && FVal->getZExtValue() != 0) {
30894       if (FVal->getZExtValue() != 1)
30895         return SDValue();
30896       // If FVal is 1, opposite cond is needed.
30897       needOppositeCond = !needOppositeCond;
30898       FValIsFalse = false;
30899     }
30900     // Quit if TVal is not the constant opposite of FVal.
30901     if (FValIsFalse && TVal->getZExtValue() != 1)
30902       return SDValue();
30903     if (!FValIsFalse && TVal->getZExtValue() != 0)
30904       return SDValue();
30905     CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30906     if (needOppositeCond)
30907       CC = X86::GetOppositeBranchCondition(CC);
30908     return SetCC.getOperand(3);
30909   }
30910   }
30911
30912   return SDValue();
30913 }
30914
30915 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30916 /// Match:
30917 ///   (X86or (X86setcc) (X86setcc))
30918 ///   (X86cmp (and (X86setcc) (X86setcc)), 0)
30919 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30920                                            X86::CondCode &CC1, SDValue &Flags,
30921                                            bool &isAnd) {
30922   if (Cond->getOpcode() == X86ISD::CMP) {
30923     if (!isNullConstant(Cond->getOperand(1)))
30924       return false;
30925
30926     Cond = Cond->getOperand(0);
30927   }
30928
30929   isAnd = false;
30930
30931   SDValue SetCC0, SetCC1;
30932   switch (Cond->getOpcode()) {
30933   default: return false;
30934   case ISD::AND:
30935   case X86ISD::AND:
30936     isAnd = true;
30937     LLVM_FALLTHROUGH;
30938   case ISD::OR:
30939   case X86ISD::OR:
30940     SetCC0 = Cond->getOperand(0);
30941     SetCC1 = Cond->getOperand(1);
30942     break;
30943   };
30944
30945   // Make sure we have SETCC nodes, using the same flags value.
30946   if (SetCC0.getOpcode() != X86ISD::SETCC ||
30947       SetCC1.getOpcode() != X86ISD::SETCC ||
30948       SetCC0->getOperand(1) != SetCC1->getOperand(1))
30949     return false;
30950
30951   CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30952   CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
30953   Flags = SetCC0->getOperand(1);
30954   return true;
30955 }
30956
30957 // When legalizing carry, we create carries via add X, -1
30958 // If that comes from an actual carry, via setcc, we use the
30959 // carry directly.
30960 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
30961   if (EFLAGS.getOpcode() == X86ISD::ADD) {
30962     if (isAllOnesConstant(EFLAGS.getOperand(1))) {
30963       SDValue Carry = EFLAGS.getOperand(0);
30964       while (Carry.getOpcode() == ISD::TRUNCATE ||
30965              Carry.getOpcode() == ISD::ZERO_EXTEND ||
30966              Carry.getOpcode() == ISD::SIGN_EXTEND ||
30967              Carry.getOpcode() == ISD::ANY_EXTEND ||
30968              (Carry.getOpcode() == ISD::AND &&
30969               isOneConstant(Carry.getOperand(1))))
30970         Carry = Carry.getOperand(0);
30971       if (Carry.getOpcode() == X86ISD::SETCC ||
30972           Carry.getOpcode() == X86ISD::SETCC_CARRY) {
30973         if (Carry.getConstantOperandVal(0) == X86::COND_B)
30974           return Carry.getOperand(1);
30975       }
30976     }
30977   }
30978
30979   return SDValue();
30980 }
30981
30982 /// Optimize an EFLAGS definition used according to the condition code \p CC
30983 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30984 /// uses of chain values.
30985 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30986                                   SelectionDAG &DAG) {
30987   if (CC == X86::COND_B)
30988     if (SDValue Flags = combineCarryThroughADD(EFLAGS))
30989       return Flags;
30990
30991   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30992     return R;
30993   return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30994 }
30995
30996 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30997 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30998                            TargetLowering::DAGCombinerInfo &DCI,
30999                            const X86Subtarget &Subtarget) {
31000   SDLoc DL(N);
31001
31002   // If the flag operand isn't dead, don't touch this CMOV.
31003   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
31004     return SDValue();
31005
31006   SDValue FalseOp = N->getOperand(0);
31007   SDValue TrueOp = N->getOperand(1);
31008   X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
31009   SDValue Cond = N->getOperand(3);
31010
31011   if (CC == X86::COND_E || CC == X86::COND_NE) {
31012     switch (Cond.getOpcode()) {
31013     default: break;
31014     case X86ISD::BSR:
31015     case X86ISD::BSF:
31016       // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
31017       if (DAG.isKnownNeverZero(Cond.getOperand(0)))
31018         return (CC == X86::COND_E) ? FalseOp : TrueOp;
31019     }
31020   }
31021
31022   // Try to simplify the EFLAGS and condition code operands.
31023   // We can't always do this as FCMOV only supports a subset of X86 cond.
31024   if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
31025     if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
31026       SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
31027         Flags};
31028       return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31029     }
31030   }
31031
31032   // If this is a select between two integer constants, try to do some
31033   // optimizations.  Note that the operands are ordered the opposite of SELECT
31034   // operands.
31035   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
31036     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
31037       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
31038       // larger than FalseC (the false value).
31039       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
31040         CC = X86::GetOppositeBranchCondition(CC);
31041         std::swap(TrueC, FalseC);
31042         std::swap(TrueOp, FalseOp);
31043       }
31044
31045       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
31046       // This is efficient for any integer data type (including i8/i16) and
31047       // shift amount.
31048       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31049         Cond = getSETCC(CC, Cond, DL, DAG);
31050
31051         // Zero extend the condition if needed.
31052         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31053
31054         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31055         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31056                            DAG.getConstant(ShAmt, DL, MVT::i8));
31057         if (N->getNumValues() == 2)  // Dead flag value?
31058           return DCI.CombineTo(N, Cond, SDValue());
31059         return Cond;
31060       }
31061
31062       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.  This is efficient
31063       // for any integer data type, including i8/i16.
31064       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31065         Cond = getSETCC(CC, Cond, DL, DAG);
31066
31067         // Zero extend the condition if needed.
31068         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31069                            FalseC->getValueType(0), Cond);
31070         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31071                            SDValue(FalseC, 0));
31072
31073         if (N->getNumValues() == 2)  // Dead flag value?
31074           return DCI.CombineTo(N, Cond, SDValue());
31075         return Cond;
31076       }
31077
31078       // Optimize cases that will turn into an LEA instruction.  This requires
31079       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31080       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31081         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31082         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31083
31084         bool isFastMultiplier = false;
31085         if (Diff < 10) {
31086           switch ((unsigned char)Diff) {
31087           default: break;
31088           case 1:  // result = add base, cond
31089           case 2:  // result = lea base(    , cond*2)
31090           case 3:  // result = lea base(cond, cond*2)
31091           case 4:  // result = lea base(    , cond*4)
31092           case 5:  // result = lea base(cond, cond*4)
31093           case 8:  // result = lea base(    , cond*8)
31094           case 9:  // result = lea base(cond, cond*8)
31095             isFastMultiplier = true;
31096             break;
31097           }
31098         }
31099
31100         if (isFastMultiplier) {
31101           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31102           Cond = getSETCC(CC, Cond, DL ,DAG);
31103           // Zero extend the condition if needed.
31104           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31105                              Cond);
31106           // Scale the condition by the difference.
31107           if (Diff != 1)
31108             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31109                                DAG.getConstant(Diff, DL, Cond.getValueType()));
31110
31111           // Add the base if non-zero.
31112           if (FalseC->getAPIntValue() != 0)
31113             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31114                                SDValue(FalseC, 0));
31115           if (N->getNumValues() == 2)  // Dead flag value?
31116             return DCI.CombineTo(N, Cond, SDValue());
31117           return Cond;
31118         }
31119       }
31120     }
31121   }
31122
31123   // Handle these cases:
  //   (select (x != c), e, c) -> (select (x != c), e, x),
  //   (select (x == c), c, e) -> (select (x == c), x, e)
31126   // where the c is an integer constant, and the "select" is the combination
31127   // of CMOV and CMP.
31128   //
31129   // The rationale for this change is that the conditional-move from a constant
31130   // needs two instructions, however, conditional-move from a register needs
31131   // only one instruction.
31132   //
31133   // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31134   //  some instruction-combining opportunities. This opt needs to be
31135   //  postponed as late as possible.
31136   //
31137   if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
31138     // the DCI.xxxx conditions are provided to postpone the optimization as
31139     // late as possible.
31140
31141     ConstantSDNode *CmpAgainst = nullptr;
31142     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31143         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31144         !isa<ConstantSDNode>(Cond.getOperand(0))) {
31145
31146       if (CC == X86::COND_NE &&
31147           CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31148         CC = X86::GetOppositeBranchCondition(CC);
31149         std::swap(TrueOp, FalseOp);
31150       }
31151
31152       if (CC == X86::COND_E &&
31153           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31154         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31155                           DAG.getConstant(CC, DL, MVT::i8), Cond };
31156         return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
31157       }
31158     }
31159   }
31160
31161   // Fold and/or of setcc's to double CMOV:
31162   //   (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31163   //   (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31164   //
31165   // This combine lets us generate:
31166   //   cmovcc1 (jcc1 if we don't have CMOV)
31167   //   cmovcc2 (same)
31168   // instead of:
31169   //   setcc1
31170   //   setcc2
31171   //   and/or
31172   //   cmovne (jne if we don't have CMOV)
31173   // When we can't use the CMOV instruction, it might increase branch
31174   // mispredicts.
31175   // When we can use CMOV, or when there is no mispredict, this improves
31176   // throughput and reduces register pressure.
31177   //
31178   if (CC == X86::COND_NE) {
31179     SDValue Flags;
31180     X86::CondCode CC0, CC1;
31181     bool isAndSetCC;
31182     if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
31183       if (isAndSetCC) {
31184         std::swap(FalseOp, TrueOp);
31185         CC0 = X86::GetOppositeBranchCondition(CC0);
31186         CC1 = X86::GetOppositeBranchCondition(CC1);
31187       }
31188
31189       SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31190         Flags};
31191       SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
31192       SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31193       SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31194       DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
31195       return CMOV;
31196     }
31197   }
31198
31199   return SDValue();
31200 }
31201
/// Different mul shrinking modes. canReduceVMulWidth picks one of these based
/// on the value range that both multiply operands are known to fit in.
enum ShrinkMode {
  MULS8,  // Operands range over -128 ~ 127 (at least 25 sign bits).
  MULU8,  // Operands range over 0 ~ 255 (non-negative, at least 24 sign bits).
  MULS16, // Operands range over -32768 ~ 32767 (at least 17 sign bits).
  MULU16  // Operands range over 0 ~ 65535 (non-negative, at least 16 sign bits).
};
31204
31205 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31206   EVT VT = N->getOperand(0).getValueType();
31207   if (VT.getScalarSizeInBits() != 32)
31208     return false;
31209
31210   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31211   unsigned SignBits[2] = {1, 1};
31212   bool IsPositive[2] = {false, false};
31213   for (unsigned i = 0; i < 2; i++) {
31214     SDValue Opd = N->getOperand(i);
31215
31216     // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
31217     // compute signbits for it separately.
31218     if (Opd.getOpcode() == ISD::ANY_EXTEND) {
31219       // For anyextend, it is safe to assume an appropriate number of leading
31220       // sign/zero bits.
31221       if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
31222         SignBits[i] = 25;
31223       else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
31224                MVT::i16)
31225         SignBits[i] = 17;
31226       else
31227         return false;
31228       IsPositive[i] = true;
31229     } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31230       // All the operands of BUILD_VECTOR need to be int constant.
31231       // Find the smallest value range which all the operands belong to.
31232       SignBits[i] = 32;
31233       IsPositive[i] = true;
31234       for (const SDValue &SubOp : Opd.getNode()->op_values()) {
31235         if (SubOp.isUndef())
31236           continue;
31237         auto *CN = dyn_cast<ConstantSDNode>(SubOp);
31238         if (!CN)
31239           return false;
31240         APInt IntVal = CN->getAPIntValue();
31241         if (IntVal.isNegative())
31242           IsPositive[i] = false;
31243         SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
31244       }
31245     } else {
31246       SignBits[i] = DAG.ComputeNumSignBits(Opd);
31247       if (Opd.getOpcode() == ISD::ZERO_EXTEND)
31248         IsPositive[i] = true;
31249     }
31250   }
31251
31252   bool AllPositive = IsPositive[0] && IsPositive[1];
31253   unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
31254   // When ranges are from -128 ~ 127, use MULS8 mode.
31255   if (MinSignBits >= 25)
31256     Mode = MULS8;
31257   // When ranges are from 0 ~ 255, use MULU8 mode.
31258   else if (AllPositive && MinSignBits >= 24)
31259     Mode = MULU8;
31260   // When ranges are from -32768 ~ 32767, use MULS16 mode.
31261   else if (MinSignBits >= 17)
31262     Mode = MULS16;
31263   // When ranges are from 0 ~ 65535, use MULU16 mode.
31264   else if (AllPositive && MinSignBits >= 16)
31265     Mode = MULU16;
31266   else
31267     return false;
31268   return true;
31269 }
31270
31271 /// When the operands of vector mul are extended from smaller size values,
31272 /// like i8 and i16, the type of mul may be shrinked to generate more
31273 /// efficient code. Two typical patterns are handled:
31274 /// Pattern1:
31275 ///     %2 = sext/zext <N x i8> %1 to <N x i32>
31276 ///     %4 = sext/zext <N x i8> %3 to <N x i32>
31277 //   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31278 ///     %5 = mul <N x i32> %2, %4
31279 ///
31280 /// Pattern2:
31281 ///     %2 = zext/sext <N x i16> %1 to <N x i32>
31282 ///     %4 = zext/sext <N x i16> %3 to <N x i32>
31283 ///  or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31284 ///     %5 = mul <N x i32> %2, %4
31285 ///
31286 /// There are four mul shrinking modes:
31287 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31288 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
31289 /// generate pmullw+sext32 for it (MULS8 mode).
31290 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
31291 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
31292 /// generate pmullw+zext32 for it (MULU8 mode).
31293 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
31294 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
31295 /// generate pmullw+pmulhw for it (MULS16 mode).
31296 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
31297 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
31298 /// generate pmullw+pmulhuw for it (MULU16 mode).
31299 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
31300                                const X86Subtarget &Subtarget) {
31301   // Check for legality
31302   // pmullw/pmulhw are not supported by SSE.
31303   if (!Subtarget.hasSSE2())
31304     return SDValue();
31305
31306   // Check for profitability
31307   // pmulld is supported since SSE41. It is better to use pmulld
31308   // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
31309   // the expansion.
31310   bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
31311   if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
31312     return SDValue();
31313
31314   ShrinkMode Mode;
31315   if (!canReduceVMulWidth(N, DAG, Mode))
31316     return SDValue();
31317
31318   SDLoc DL(N);
31319   SDValue N0 = N->getOperand(0);
31320   SDValue N1 = N->getOperand(1);
31321   EVT VT = N->getOperand(0).getValueType();
31322   unsigned RegSize = 128;
31323   MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
31324   EVT ReducedVT =
31325       EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
31326   // Shrink the operands of mul.
31327   SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
31328   SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
31329
31330   if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
31331     // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
31332     // lower part is needed.
31333     SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
31334     if (Mode == MULU8 || Mode == MULS8) {
31335       return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
31336                          DL, VT, MulLo);
31337     } else {
31338       MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31339       // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
31340       // the higher part is also needed.
31341       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31342                                   ReducedVT, NewN0, NewN1);
31343
31344       // Repack the lower part and higher part result of mul into a wider
31345       // result.
31346       // Generate shuffle functioning as punpcklwd.
31347       SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
31348       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31349         ShuffleMask[2 * i] = i;
31350         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
31351       }
31352       SDValue ResLo =
31353           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31354       ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
31355       // Generate shuffle functioning as punpckhwd.
31356       for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31357         ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
31358         ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
31359       }
31360       SDValue ResHi =
31361           DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31362       ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
31363       return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
31364     }
31365   } else {
31366     // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
31367     // to legalize the mul explicitly because implicit legalization for type
31368     // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
31369     // instructions which will not exist when we explicitly legalize it by
31370     // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
31371     // <4 x i16> undef).
31372     //
31373     // Legalize the operands of mul.
31374     // FIXME: We may be able to handle non-concatenated vectors by insertion.
31375     unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
31376     if ((RegSize % ReducedSizeInBits) != 0)
31377       return SDValue();
31378
31379     SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
31380                                  DAG.getUNDEF(ReducedVT));
31381     Ops[0] = NewN0;
31382     NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31383     Ops[0] = NewN1;
31384     NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31385
31386     if (Mode == MULU8 || Mode == MULS8) {
31387       // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
31388       // part is needed.
31389       SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31390
31391       // convert the type of mul result to VT.
31392       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31393       SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
31394                                               : ISD::SIGN_EXTEND_VECTOR_INREG,
31395                                 DL, ResVT, Mul);
31396       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31397                          DAG.getIntPtrConstant(0, DL));
31398     } else {
31399       // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
31400       // MULU16/MULS16, both parts are needed.
31401       SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31402       SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31403                                   OpsVT, NewN0, NewN1);
31404
31405       // Repack the lower part and higher part result of mul into a wider
31406       // result. Make sure the type of mul result is VT.
31407       MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31408       SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31409       Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31410       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31411                          DAG.getIntPtrConstant(0, DL));
31412     }
31413   }
31414 }
31415
31416 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
31417                                  EVT VT, SDLoc DL) {
31418
31419   auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
31420     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31421                                  DAG.getConstant(Mult, DL, VT));
31422     Result = DAG.getNode(ISD::SHL, DL, VT, Result,
31423                          DAG.getConstant(Shift, DL, MVT::i8));
31424     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31425                          N->getOperand(0));
31426     return Result;
31427   };
31428
31429   auto combineMulMulAddOrSub = [&](bool isAdd) {
31430     SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31431                                  DAG.getConstant(9, DL, VT));
31432     Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
31433     Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31434                          N->getOperand(0));
31435     return Result;
31436   };
31437
31438   switch (MulAmt) {
31439   default:
31440     break;
31441   case 11:
31442     // mul x, 11 => add ((shl (mul x, 5), 1), x)
31443     return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
31444   case 21:
31445     // mul x, 21 => add ((shl (mul x, 5), 2), x)
31446     return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
31447   case 22:
31448     // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
31449     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31450                        combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
31451   case 19:
31452     // mul x, 19 => sub ((shl (mul x, 5), 2), x)
31453     return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
31454   case 13:
31455     // mul x, 13 => add ((shl (mul x, 3), 2), x)
31456     return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
31457   case 23:
31458     // mul x, 13 => sub ((shl (mul x, 3), 3), x)
31459     return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
31460   case 14:
31461     // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
31462     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31463                        combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
31464   case 26:
31465     // mul x, 26 => sub ((mul (mul x, 9), 3), x)
31466     return combineMulMulAddOrSub(/*isAdd*/ false);
31467   case 28:
31468     // mul x, 28 => add ((mul (mul x, 9), 3), x)
31469     return combineMulMulAddOrSub(/*isAdd*/ true);
31470   case 29:
31471     // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
31472     return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31473                        combineMulMulAddOrSub(/*isAdd*/ true));
31474   case 30:
31475     // mul x, 30 => sub (sub ((shl x, 5), x), x)
31476     return DAG.getNode(
31477         ISD::SUB, DL, VT,
31478         DAG.getNode(ISD::SUB, DL, VT,
31479                     DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31480                                 DAG.getConstant(5, DL, MVT::i8)),
31481                     N->getOperand(0)),
31482         N->getOperand(0));
31483   }
31484   return SDValue();
31485 }
31486
31487 /// Optimize a single multiply with constant into two operations in order to
31488 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
31489 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31490                           TargetLowering::DAGCombinerInfo &DCI,
31491                           const X86Subtarget &Subtarget) {
31492   EVT VT = N->getValueType(0);
31493   if (DCI.isBeforeLegalize() && VT.isVector())
31494     return reduceVMULWidth(N, DAG, Subtarget);
31495
31496   if (!MulConstantOptimization)
31497     return SDValue();
31498   // An imul is usually smaller than the alternative sequence.
31499   if (DAG.getMachineFunction().getFunction()->optForMinSize())
31500     return SDValue();
31501
31502   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
31503     return SDValue();
31504
31505   if (VT != MVT::i64 && VT != MVT::i32)
31506     return SDValue();
31507
31508   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
31509   if (!C)
31510     return SDValue();
31511   uint64_t MulAmt = C->getZExtValue();
31512   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
31513     return SDValue();
31514
31515   uint64_t MulAmt1 = 0;
31516   uint64_t MulAmt2 = 0;
31517   if ((MulAmt % 9) == 0) {
31518     MulAmt1 = 9;
31519     MulAmt2 = MulAmt / 9;
31520   } else if ((MulAmt % 5) == 0) {
31521     MulAmt1 = 5;
31522     MulAmt2 = MulAmt / 5;
31523   } else if ((MulAmt % 3) == 0) {
31524     MulAmt1 = 3;
31525     MulAmt2 = MulAmt / 3;
31526   }
31527
31528   SDLoc DL(N);
31529   SDValue NewMul;
31530   if (MulAmt2 &&
31531       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
31532
31533     if (isPowerOf2_64(MulAmt2) &&
31534         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
31535       // If second multiplifer is pow2, issue it first. We want the multiply by
31536       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
31537       // is an add.
31538       std::swap(MulAmt1, MulAmt2);
31539
31540     if (isPowerOf2_64(MulAmt1))
31541       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31542                            DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31543     else
31544       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31545                            DAG.getConstant(MulAmt1, DL, VT));
31546
31547     if (isPowerOf2_64(MulAmt2))
31548       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31549                            DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31550     else
31551       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31552                            DAG.getConstant(MulAmt2, DL, VT));
31553   } else if (!Subtarget.slowLEA())
31554     NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
31555
31556   if (!NewMul) {
31557     assert(MulAmt != 0 &&
31558            MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31559            "Both cases that could cause potential overflows should have "
31560            "already been handled.");
31561     int64_t SignMulAmt = C->getSExtValue();
31562     if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31563         (SignMulAmt != -INT64_MAX)) {
31564       int NumSign = SignMulAmt > 0 ? 1 : -1;
31565       bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31566       bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31567       if (IsPowerOf2_64PlusOne) {
31568         // (mul x, 2^N + 1) => (add (shl x, N), x)
31569         NewMul = DAG.getNode(
31570             ISD::ADD, DL, VT, N->getOperand(0),
31571             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31572                         DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31573                                         MVT::i8)));
31574       } else if (IsPowerOf2_64MinusOne) {
31575         // (mul x, 2^N - 1) => (sub (shl x, N), x)
31576         NewMul = DAG.getNode(
31577             ISD::SUB, DL, VT,
31578             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31579                         DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31580                                         MVT::i8)),
31581             N->getOperand(0));
31582       }
31583       // To negate, subtract the number from zero
31584       if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
31585         NewMul =
31586             DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
31587     }
31588   }
31589
31590   if (NewMul)
31591     // Do not add new nodes to DAG combiner worklist.
31592     DCI.CombineTo(N, NewMul, false);
31593
31594   return SDValue();
31595 }
31596
31597 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31598   SDValue N0 = N->getOperand(0);
31599   SDValue N1 = N->getOperand(1);
31600   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31601   EVT VT = N0.getValueType();
31602
31603   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31604   // since the result of setcc_c is all zero's or all ones.
31605   if (VT.isInteger() && !VT.isVector() &&
31606       N1C && N0.getOpcode() == ISD::AND &&
31607       N0.getOperand(1).getOpcode() == ISD::Constant) {
31608     SDValue N00 = N0.getOperand(0);
31609     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31610     Mask <<= N1C->getAPIntValue();
31611     bool MaskOK = false;
31612     // We can handle cases concerning bit-widening nodes containing setcc_c if
31613     // we carefully interrogate the mask to make sure we are semantics
31614     // preserving.
31615     // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31616     // of the underlying setcc_c operation if the setcc_c was zero extended.
31617     // Consider the following example:
31618     //   zext(setcc_c)                 -> i32 0x0000FFFF
31619     //   c1                            -> i32 0x0000FFFF
31620     //   c2                            -> i32 0x00000001
31621     //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31622     //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
31623     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
31624       MaskOK = true;
31625     } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
31626                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31627       MaskOK = true;
31628     } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31629                 N00.getOpcode() == ISD::ANY_EXTEND) &&
31630                N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31631       MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31632     }
31633     if (MaskOK && Mask != 0) {
31634       SDLoc DL(N);
31635       return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31636     }
31637   }
31638
31639   // Hardware support for vector shifts is sparse which makes us scalarize the
31640   // vector operations in many cases. Also, on sandybridge ADD is faster than
31641   // shl.
31642   // (shl V, 1) -> add V,V
31643   if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31644     if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31645       assert(N0.getValueType().isVector() && "Invalid vector shift type");
31646       // We shift all of the values by one. In many cases we do not have
31647       // hardware support for this operation. This is better expressed as an ADD
31648       // of two values.
31649       if (N1SplatC->getAPIntValue() == 1)
31650         return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31651     }
31652
31653   return SDValue();
31654 }
31655
31656 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31657   SDValue N0 = N->getOperand(0);
31658   SDValue N1 = N->getOperand(1);
31659   EVT VT = N0.getValueType();
31660   unsigned Size = VT.getSizeInBits();
31661
31662   // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31663   // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31664   // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31665   // depending on sign of (SarConst - [56,48,32,24,16])
31666
31667   // sexts in X86 are MOVs. The MOVs have the same code size
31668   // as above SHIFTs (only SHIFT on 1 has lower code size).
31669   // However the MOVs have 2 advantages to a SHIFT:
31670   // 1. MOVs can write to a register that differs from source
31671   // 2. MOVs accept memory operands
31672
31673   if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31674       N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31675       N0.getOperand(1).getOpcode() != ISD::Constant)
31676     return SDValue();
31677
31678   SDValue N00 = N0.getOperand(0);
31679   SDValue N01 = N0.getOperand(1);
31680   APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31681   APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31682   EVT CVT = N1.getValueType();
31683
31684   if (SarConst.isNegative())
31685     return SDValue();
31686
31687   for (MVT SVT : MVT::integer_valuetypes()) {
31688     unsigned ShiftSize = SVT.getSizeInBits();
31689     // skipping types without corresponding sext/zext and
31690     // ShlConst that is not one of [56,48,32,24,16]
31691     if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
31692       continue;
31693     SDLoc DL(N);
31694     SDValue NN =
31695         DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
31696     SarConst = SarConst - (Size - ShiftSize);
31697     if (SarConst == 0)
31698       return NN;
31699     else if (SarConst.isNegative())
31700       return DAG.getNode(ISD::SHL, DL, VT, NN,
31701                          DAG.getConstant(-SarConst, DL, CVT));
31702     else
31703       return DAG.getNode(ISD::SRA, DL, VT, NN,
31704                          DAG.getConstant(SarConst, DL, CVT));
31705   }
31706   return SDValue();
31707 }
31708
31709 /// \brief Returns a vector of 0s if the node in input is a vector logical
31710 /// shift by a constant amount which is known to be bigger than or equal
31711 /// to the vector element size in bits.
31712 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31713                                       const X86Subtarget &Subtarget) {
31714   EVT VT = N->getValueType(0);
31715
31716   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31717       (!Subtarget.hasInt256() ||
31718        (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
31719     return SDValue();
31720
31721   SDValue Amt = N->getOperand(1);
31722   SDLoc DL(N);
31723   if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31724     if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31725       const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31726       unsigned MaxAmount =
31727         VT.getSimpleVT().getScalarSizeInBits();
31728
31729       // SSE2/AVX2 logical shifts always return a vector of 0s
31730       // if the shift amount is bigger than or equal to
31731       // the element size. The constant shift amount will be
31732       // encoded as a 8-bit immediate.
31733       if (ShiftAmt.trunc(8).uge(MaxAmount))
31734         return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31735     }
31736
31737   return SDValue();
31738 }
31739
31740 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31741                             TargetLowering::DAGCombinerInfo &DCI,
31742                             const X86Subtarget &Subtarget) {
31743   if (N->getOpcode() == ISD::SHL)
31744     if (SDValue V = combineShiftLeft(N, DAG))
31745       return V;
31746
31747   if (N->getOpcode() == ISD::SRA)
31748     if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31749       return V;
31750
31751   // Try to fold this logical shift into a zero vector.
31752   if (N->getOpcode() != ISD::SRA)
31753     if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31754       return V;
31755
31756   return SDValue();
31757 }
31758
31759 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31760                                      TargetLowering::DAGCombinerInfo &DCI,
31761                                      const X86Subtarget &Subtarget) {
31762   unsigned Opcode = N->getOpcode();
31763   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31764           X86ISD::VSRLI == Opcode) &&
31765          "Unexpected shift opcode");
31766   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31767   EVT VT = N->getValueType(0);
31768   SDValue N0 = N->getOperand(0);
31769   SDValue N1 = N->getOperand(1);
31770   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31771   assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31772          "Unexpected value type");
31773
31774   // Out of range logical bit shifts are guaranteed to be zero.
31775   // Out of range arithmetic bit shifts splat the sign bit.
31776   APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
31777   if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
31778     if (LogicalShift)
31779       return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31780     else
31781       ShiftVal = NumBitsPerElt - 1;
31782   }
31783
31784   // Shift N0 by zero -> N0.
31785   if (!ShiftVal)
31786     return N0;
31787
31788   // Shift zero -> zero.
31789   if (ISD::isBuildVectorAllZeros(N0.getNode()))
31790     return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31791
31792   // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31793   // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31794   // TODO - support other sra opcodes as needed.
31795   if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31796       N0.getOpcode() == X86ISD::VSRAI)
31797     return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31798
31799   // We can decode 'whole byte' logical bit shifts as shuffles.
31800   if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
31801     SDValue Op(N, 0);
31802     SmallVector<int, 1> NonceMask; // Just a placeholder.
31803     NonceMask.push_back(0);
31804     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31805                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31806                                       DCI, Subtarget))
31807       return SDValue(); // This routine will use CombineTo to replace N.
31808   }
31809
31810   // Constant Folding.
31811   APInt UndefElts;
31812   SmallVector<APInt, 32> EltBits;
31813   if (N->isOnlyUserOf(N0.getNode()) &&
31814       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31815     assert(EltBits.size() == VT.getVectorNumElements() &&
31816            "Unexpected shift value type");
31817     unsigned ShiftImm = ShiftVal.getZExtValue();
31818     for (APInt &Elt : EltBits) {
31819       if (X86ISD::VSHLI == Opcode)
31820         Elt <<= ShiftImm;
31821       else if (X86ISD::VSRAI == Opcode)
31822         Elt.ashrInPlace(ShiftImm);
31823       else
31824         Elt.lshrInPlace(ShiftImm);
31825     }
31826     return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31827   }
31828
31829   return SDValue();
31830 }
31831
31832 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31833                                    TargetLowering::DAGCombinerInfo &DCI,
31834                                    const X86Subtarget &Subtarget) {
31835   assert(
31836       ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31837        (N->getOpcode() == X86ISD::PINSRW &&
31838         N->getValueType(0) == MVT::v8i16)) &&
31839       "Unexpected vector insertion");
31840
31841   // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
31842   SDValue Op(N, 0);
31843   SmallVector<int, 1> NonceMask; // Just a placeholder.
31844   NonceMask.push_back(0);
31845   combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31846                                 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31847                                 DCI, Subtarget);
31848   return SDValue();
31849 }
31850
31851 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31852 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31853 /// OR -> CMPNEQSS.
31854 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31855                                    TargetLowering::DAGCombinerInfo &DCI,
31856                                    const X86Subtarget &Subtarget) {
31857   unsigned opcode;
31858
31859   // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31860   // we're requiring SSE2 for both.
31861   if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31862     SDValue N0 = N->getOperand(0);
31863     SDValue N1 = N->getOperand(1);
31864     SDValue CMP0 = N0->getOperand(1);
31865     SDValue CMP1 = N1->getOperand(1);
31866     SDLoc DL(N);
31867
31868     // The SETCCs should both refer to the same CMP.
31869     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31870       return SDValue();
31871
31872     SDValue CMP00 = CMP0->getOperand(0);
31873     SDValue CMP01 = CMP0->getOperand(1);
31874     EVT     VT    = CMP00.getValueType();
31875
31876     if (VT == MVT::f32 || VT == MVT::f64) {
31877       bool ExpectingFlags = false;
31878       // Check for any users that want flags:
31879       for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31880            !ExpectingFlags && UI != UE; ++UI)
31881         switch (UI->getOpcode()) {
31882         default:
31883         case ISD::BR_CC:
31884         case ISD::BRCOND:
31885         case ISD::SELECT:
31886           ExpectingFlags = true;
31887           break;
31888         case ISD::CopyToReg:
31889         case ISD::SIGN_EXTEND:
31890         case ISD::ZERO_EXTEND:
31891         case ISD::ANY_EXTEND:
31892           break;
31893         }
31894
31895       if (!ExpectingFlags) {
31896         enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31897         enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31898
31899         if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31900           X86::CondCode tmp = cc0;
31901           cc0 = cc1;
31902           cc1 = tmp;
31903         }
31904
31905         if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
31906             (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31907           // FIXME: need symbolic constants for these magic numbers.
31908           // See X86ATTInstPrinter.cpp:printSSECC().
31909           unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31910           if (Subtarget.hasAVX512()) {
31911             SDValue FSetCC =
31912                 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31913                             DAG.getConstant(x86cc, DL, MVT::i8));
31914             return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31915                                FSetCC, DAG.getIntPtrConstant(0, DL));
31916           }
31917           SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31918                                               CMP00.getValueType(), CMP00, CMP01,
31919                                               DAG.getConstant(x86cc, DL,
31920                                                               MVT::i8));
31921
31922           bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31923           MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31924
31925           if (is64BitFP && !Subtarget.is64Bit()) {
31926             // On a 32-bit target, we cannot bitcast the 64-bit float to a
31927             // 64-bit integer, since that's not a legal type. Since
31928             // OnesOrZeroesF is all ones of all zeroes, we don't need all the
31929             // bits, but can do this little dance to extract the lowest 32 bits
31930             // and work with those going forward.
31931             SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31932                                            OnesOrZeroesF);
31933             SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31934             OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31935                                         Vector32, DAG.getIntPtrConstant(0, DL));
31936             IntVT = MVT::i32;
31937           }
31938
31939           SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31940           SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31941                                       DAG.getConstant(1, DL, IntVT));
31942           SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31943                                               ANDed);
31944           return OneBitOfTruth;
31945         }
31946       }
31947     }
31948   }
31949   return SDValue();
31950 }
31951
31952 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31953 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31954   assert(N->getOpcode() == ISD::AND);
31955
31956   EVT VT = N->getValueType(0);
31957   SDValue N0 = N->getOperand(0);
31958   SDValue N1 = N->getOperand(1);
31959   SDLoc DL(N);
31960
31961   if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
31962     return SDValue();
31963
31964   if (N0.getOpcode() == ISD::XOR &&
31965       ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31966     return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31967
31968   if (N1.getOpcode() == ISD::XOR &&
31969       ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31970     return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31971
31972   return SDValue();
31973 }
31974
31975 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31976 // register. In most cases we actually compare or select YMM-sized registers
31977 // and mixing the two types creates horrible code. This method optimizes
31978 // some of the transition sequences.
31979 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31980                                  TargetLowering::DAGCombinerInfo &DCI,
31981                                  const X86Subtarget &Subtarget) {
31982   EVT VT = N->getValueType(0);
31983   if (!VT.is256BitVector())
31984     return SDValue();
31985
31986   assert((N->getOpcode() == ISD::ANY_EXTEND ||
31987           N->getOpcode() == ISD::ZERO_EXTEND ||
31988           N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31989
31990   SDValue Narrow = N->getOperand(0);
31991   EVT NarrowVT = Narrow->getValueType(0);
31992   if (!NarrowVT.is128BitVector())
31993     return SDValue();
31994
31995   if (Narrow->getOpcode() != ISD::XOR &&
31996       Narrow->getOpcode() != ISD::AND &&
31997       Narrow->getOpcode() != ISD::OR)
31998     return SDValue();
31999
32000   SDValue N0  = Narrow->getOperand(0);
32001   SDValue N1  = Narrow->getOperand(1);
32002   SDLoc DL(Narrow);
32003
32004   // The Left side has to be a trunc.
32005   if (N0.getOpcode() != ISD::TRUNCATE)
32006     return SDValue();
32007
32008   // The type of the truncated inputs.
32009   EVT WideVT = N0->getOperand(0)->getValueType(0);
32010   if (WideVT != VT)
32011     return SDValue();
32012
32013   // The right side has to be a 'trunc' or a constant vector.
32014   bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
32015   ConstantSDNode *RHSConstSplat = nullptr;
32016   if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
32017     RHSConstSplat = RHSBV->getConstantSplatNode();
32018   if (!RHSTrunc && !RHSConstSplat)
32019     return SDValue();
32020
32021   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32022
32023   if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
32024     return SDValue();
32025
32026   // Set N0 and N1 to hold the inputs to the new wide operation.
32027   N0 = N0->getOperand(0);
32028   if (RHSConstSplat) {
32029     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
32030                      SDValue(RHSConstSplat, 0));
32031     N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
32032   } else if (RHSTrunc) {
32033     N1 = N1->getOperand(0);
32034   }
32035
32036   // Generate the wide operation.
32037   SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
32038   unsigned Opcode = N->getOpcode();
32039   switch (Opcode) {
32040   case ISD::ANY_EXTEND:
32041     return Op;
32042   case ISD::ZERO_EXTEND: {
32043     unsigned InBits = NarrowVT.getScalarSizeInBits();
32044     APInt Mask = APInt::getAllOnesValue(InBits);
32045     Mask = Mask.zext(VT.getScalarSizeInBits());
32046     return DAG.getNode(ISD::AND, DL, VT,
32047                        Op, DAG.getConstant(Mask, DL, VT));
32048   }
32049   case ISD::SIGN_EXTEND:
32050     return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
32051                        Op, DAG.getValueType(NarrowVT));
32052   default:
32053     llvm_unreachable("Unexpected opcode");
32054   }
32055 }
32056
32057 /// If both input operands of a logic op are being cast from floating point
32058 /// types, try to convert this into a floating point logic node to avoid
32059 /// unnecessary moves from SSE to integer registers.
32060 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
32061                                         const X86Subtarget &Subtarget) {
32062   unsigned FPOpcode = ISD::DELETED_NODE;
32063   if (N->getOpcode() == ISD::AND)
32064     FPOpcode = X86ISD::FAND;
32065   else if (N->getOpcode() == ISD::OR)
32066     FPOpcode = X86ISD::FOR;
32067   else if (N->getOpcode() == ISD::XOR)
32068     FPOpcode = X86ISD::FXOR;
32069
32070   assert(FPOpcode != ISD::DELETED_NODE &&
32071          "Unexpected input node for FP logic conversion");
32072
32073   EVT VT = N->getValueType(0);
32074   SDValue N0 = N->getOperand(0);
32075   SDValue N1 = N->getOperand(1);
32076   SDLoc DL(N);
32077   if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32078       ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32079        (Subtarget.hasSSE2() && VT == MVT::i64))) {
32080     SDValue N00 = N0.getOperand(0);
32081     SDValue N10 = N1.getOperand(0);
32082     EVT N00Type = N00.getValueType();
32083     EVT N10Type = N10.getValueType();
32084     if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32085       SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
32086       return DAG.getBitcast(VT, FPLogic);
32087     }
32088   }
32089   return SDValue();
32090 }
32091
32092 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
32093 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
32094 /// with a shift-right to eliminate loading the vector constant mask value.
32095 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32096                                      const X86Subtarget &Subtarget) {
32097   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32098   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32099   EVT VT0 = Op0.getValueType();
32100   EVT VT1 = Op1.getValueType();
32101
32102   if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
32103     return SDValue();
32104
32105   APInt SplatVal;
32106   if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal,
32107                                   /*AllowShrink*/false) ||
32108       !SplatVal.isMask())
32109     return SDValue();
32110
32111   if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
32112     return SDValue();
32113
32114   unsigned EltBitWidth = VT0.getScalarSizeInBits();
32115   if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
32116     return SDValue();
32117
32118   SDLoc DL(N);
32119   unsigned ShiftVal = SplatVal.countTrailingOnes();
32120   SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32121   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32122   return DAG.getBitcast(N->getValueType(0), Shift);
32123 }
32124
32125 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32126                           TargetLowering::DAGCombinerInfo &DCI,
32127                           const X86Subtarget &Subtarget) {
32128   if (DCI.isBeforeLegalizeOps())
32129     return SDValue();
32130
32131   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32132     return R;
32133
32134   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32135     return FPLogic;
32136
32137   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
32138     return R;
32139
32140   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
32141     return ShiftRight;
32142
32143   EVT VT = N->getValueType(0);
32144   SDValue N0 = N->getOperand(0);
32145   SDValue N1 = N->getOperand(1);
32146   SDLoc DL(N);
32147
32148   // Attempt to recursively combine a bitmask AND with shuffles.
32149   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
32150     SDValue Op(N, 0);
32151     SmallVector<int, 1> NonceMask; // Just a placeholder.
32152     NonceMask.push_back(0);
32153     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
32154                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
32155                                       DCI, Subtarget))
32156       return SDValue(); // This routine will use CombineTo to replace N.
32157   }
32158
32159   // Create BEXTR instructions
32160   // BEXTR is ((X >> imm) & (2**size-1))
32161   if (VT != MVT::i32 && VT != MVT::i64)
32162     return SDValue();
32163
32164   if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
32165     return SDValue();
32166   if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
32167     return SDValue();
32168
32169   ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
32170   ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32171   if (MaskNode && ShiftNode) {
32172     uint64_t Mask = MaskNode->getZExtValue();
32173     uint64_t Shift = ShiftNode->getZExtValue();
32174     if (isMask_64(Mask)) {
32175       uint64_t MaskSize = countPopulation(Mask);
32176       if (Shift + MaskSize <= VT.getSizeInBits())
32177         return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
32178                            DAG.getConstant(Shift | (MaskSize << 8), DL,
32179                                            VT));
32180     }
32181   }
32182   return SDValue();
32183 }
32184
32185 // Try to fold:
32186 //   (or (and (m, y), (pandn m, x)))
32187 // into:
32188 //   (vselect m, x, y)
32189 // As a special case, try to fold:
32190 //   (or (and (m, (sub 0, x)), (pandn m, x)))
32191 // into:
32192 //   (sub (xor X, M), M)
32193 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
32194                                             const X86Subtarget &Subtarget) {
32195   assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32196
32197   SDValue N0 = N->getOperand(0);
32198   SDValue N1 = N->getOperand(1);
32199   EVT VT = N->getValueType(0);
32200
32201   if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
32202         (VT.is256BitVector() && Subtarget.hasInt256())))
32203     return SDValue();
32204
32205   // Canonicalize AND to LHS.
32206   if (N1.getOpcode() == ISD::AND)
32207     std::swap(N0, N1);
32208
32209   // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
32210   // ANDNP combine allows other combines to happen that prevent matching.
32211   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
32212     return SDValue();
32213
32214   SDValue Mask = N1.getOperand(0);
32215   SDValue X = N1.getOperand(1);
32216   SDValue Y;
32217   if (N0.getOperand(0) == Mask)
32218     Y = N0.getOperand(1);
32219   if (N0.getOperand(1) == Mask)
32220     Y = N0.getOperand(0);
32221
32222   // Check to see if the mask appeared in both the AND and ANDNP.
32223   if (!Y.getNode())
32224     return SDValue();
32225
32226   // Validate that X, Y, and Mask are bitcasts, and see through them.
32227   Mask = peekThroughBitcasts(Mask);
32228   X = peekThroughBitcasts(X);
32229   Y = peekThroughBitcasts(Y);
32230
32231   EVT MaskVT = Mask.getValueType();
32232   unsigned EltBits = MaskVT.getScalarSizeInBits();
32233
32234   // TODO: Attempt to handle floating point cases as well?
32235   if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
32236     return SDValue();
32237
32238   SDLoc DL(N);
32239
32240   // Try to match:
32241   //   (or (and (M, (sub 0, X)), (pandn M, X)))
32242   // which is a special case of vselect:
32243   //   (vselect M, (sub 0, X), X)
32244   // Per:
32245   // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
32246   // We know that, if fNegate is 0 or 1:
32247   //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
32248   //
32249   // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
32250   //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
32251   //   ( M      ? -X : X) == ((X ^   M     ) + (M & 1))
32252   // This lets us transform our vselect to:
32253   //   (add (xor X, M), (and M, 1))
32254   // And further to:
32255   //   (sub (xor X, M), M)
32256   if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
32257       DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
32258     auto IsNegV = [](SDNode *N, SDValue V) {
32259       return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
32260         ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
32261     };
32262     SDValue V;
32263     if (IsNegV(Y.getNode(), X))
32264       V = X;
32265     else if (IsNegV(X.getNode(), Y))
32266       V = Y;
32267
32268     if (V) {
32269       SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
32270       SDValue SubOp2 = Mask;
32271
32272       // If the negate was on the false side of the select, then
32273       // the operands of the SUB need to be swapped. PR 27251.
32274       // This is because the pattern being matched above is
32275       // (vselect M, (sub (0, X), X)  -> (sub (xor X, M), M)
32276       // but if the pattern matched was
32277       // (vselect M, X, (sub (0, X))), that is really negation of the pattern
32278       // above, -(vselect M, (sub 0, X), X), and therefore the replacement
32279       // pattern also needs to be a negation of the replacement pattern above.
32280       // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
32281       // sub accomplishes the negation of the replacement pattern.
32282       if (V == Y)
32283          std::swap(SubOp1, SubOp2);
32284
32285       SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
32286       return DAG.getBitcast(VT, Res);
32287     }
32288   }
32289
32290   // PBLENDVB is only available on SSE 4.1.
32291   if (!Subtarget.hasSSE41())
32292     return SDValue();
32293
32294   MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
32295
32296   X = DAG.getBitcast(BlendVT, X);
32297   Y = DAG.getBitcast(BlendVT, Y);
32298   Mask = DAG.getBitcast(BlendVT, Mask);
32299   Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
32300   return DAG.getBitcast(VT, Mask);
32301 }
32302
32303 // Helper function for combineOrCmpEqZeroToCtlzSrl
32304 // Transforms:
32305 //   seteq(cmp x, 0)
32306 //   into:
32307 //   srl(ctlz x), log2(bitsize(x))
32308 // Input pattern is checked by caller.
32309 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
32310                                           SelectionDAG &DAG) {
32311   SDValue Cmp = Op.getOperand(1);
32312   EVT VT = Cmp.getOperand(0).getValueType();
32313   unsigned Log2b = Log2_32(VT.getSizeInBits());
32314   SDLoc dl(Op);
32315   SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
32316   // The result of the shift is true or false, and on X86, the 32-bit
32317   // encoding of shr and lzcnt is more desirable.
32318   SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
32319   SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
32320                             DAG.getConstant(Log2b, dl, VT));
32321   return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
32322 }
32323
32324 // Try to transform:
32325 //   zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
32326 //   into:
32327 //   srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
32328 // Will also attempt to match more generic cases, eg:
32329 //   zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
32330 // Only applies if the target supports the FastLZCNT feature.
32331 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
32332                                            TargetLowering::DAGCombinerInfo &DCI,
32333                                            const X86Subtarget &Subtarget) {
32334   if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
32335     return SDValue();
32336
32337   auto isORCandidate = [](SDValue N) {
32338     return (N->getOpcode() == ISD::OR && N->hasOneUse());
32339   };
32340
32341   // Check the zero extend is extending to 32-bit or more. The code generated by
32342   // srl(ctlz) for 16-bit or less variants of the pattern would require extra
32343   // instructions to clear the upper bits.
32344   if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
32345       !isORCandidate(N->getOperand(0)))
32346     return SDValue();
32347
32348   // Check the node matches: setcc(eq, cmp 0)
32349   auto isSetCCCandidate = [](SDValue N) {
32350     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
32351            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
32352            N->getOperand(1).getOpcode() == X86ISD::CMP &&
32353            isNullConstant(N->getOperand(1).getOperand(1)) &&
32354            N->getOperand(1).getValueType().bitsGE(MVT::i32);
32355   };
32356
32357   SDNode *OR = N->getOperand(0).getNode();
32358   SDValue LHS = OR->getOperand(0);
32359   SDValue RHS = OR->getOperand(1);
32360
32361   // Save nodes matching or(or, setcc(eq, cmp 0)).
32362   SmallVector<SDNode *, 2> ORNodes;
32363   while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
32364           (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
32365     ORNodes.push_back(OR);
32366     OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
32367     LHS = OR->getOperand(0);
32368     RHS = OR->getOperand(1);
32369   }
32370
32371   // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
32372   if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
32373       !isORCandidate(SDValue(OR, 0)))
32374     return SDValue();
32375
32376   // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
32377   // to
32378   // or(srl(ctlz),srl(ctlz)).
32379   // The dag combiner can then fold it into:
32380   // srl(or(ctlz, ctlz)).
32381   EVT VT = OR->getValueType(0);
32382   SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
32383   SDValue Ret, NewRHS;
32384   if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
32385     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
32386
32387   if (!Ret)
32388     return SDValue();
32389
32390   // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
32391   while (ORNodes.size() > 0) {
32392     OR = ORNodes.pop_back_val();
32393     LHS = OR->getOperand(0);
32394     RHS = OR->getOperand(1);
32395     // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
32396     if (RHS->getOpcode() == ISD::OR)
32397       std::swap(LHS, RHS);
32398     EVT VT = OR->getValueType(0);
32399     SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
32400     if (!NewRHS)
32401       return SDValue();
32402     Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
32403   }
32404
32405   if (Ret)
32406     Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
32407
32408   return Ret;
32409 }
32410
32411 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
32412                          TargetLowering::DAGCombinerInfo &DCI,
32413                          const X86Subtarget &Subtarget) {
32414   if (DCI.isBeforeLegalizeOps())
32415     return SDValue();
32416
32417   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32418     return R;
32419
32420   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32421     return FPLogic;
32422
32423   if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
32424     return R;
32425
32426   SDValue N0 = N->getOperand(0);
32427   SDValue N1 = N->getOperand(1);
32428   EVT VT = N->getValueType(0);
32429
32430   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
32431     return SDValue();
32432
32433   // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
32434   bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
32435
32436   // SHLD/SHRD instructions have lower register pressure, but on some
32437   // platforms they have higher latency than the equivalent
32438   // series of shifts/or that would otherwise be generated.
32439   // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
32440   // have higher latencies and we are not optimizing for size.
32441   if (!OptForSize && Subtarget.isSHLDSlow())
32442     return SDValue();
32443
32444   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
32445     std::swap(N0, N1);
32446   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
32447     return SDValue();
32448   if (!N0.hasOneUse() || !N1.hasOneUse())
32449     return SDValue();
32450
32451   SDValue ShAmt0 = N0.getOperand(1);
32452   if (ShAmt0.getValueType() != MVT::i8)
32453     return SDValue();
32454   SDValue ShAmt1 = N1.getOperand(1);
32455   if (ShAmt1.getValueType() != MVT::i8)
32456     return SDValue();
32457   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
32458     ShAmt0 = ShAmt0.getOperand(0);
32459   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
32460     ShAmt1 = ShAmt1.getOperand(0);
32461
32462   SDLoc DL(N);
32463   unsigned Opc = X86ISD::SHLD;
32464   SDValue Op0 = N0.getOperand(0);
32465   SDValue Op1 = N1.getOperand(0);
32466   if (ShAmt0.getOpcode() == ISD::SUB ||
32467       ShAmt0.getOpcode() == ISD::XOR) {
32468     Opc = X86ISD::SHRD;
32469     std::swap(Op0, Op1);
32470     std::swap(ShAmt0, ShAmt1);
32471   }
32472
32473   // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
32474   // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
32475   // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
32476   // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
32477   unsigned Bits = VT.getSizeInBits();
32478   if (ShAmt1.getOpcode() == ISD::SUB) {
32479     SDValue Sum = ShAmt1.getOperand(0);
32480     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
32481       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
32482       if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32483         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32484       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
32485         return DAG.getNode(Opc, DL, VT,
32486                            Op0, Op1,
32487                            DAG.getNode(ISD::TRUNCATE, DL,
32488                                        MVT::i8, ShAmt0));
32489     }
32490   } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32491     ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32492     if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32493       return DAG.getNode(Opc, DL, VT,
32494                          N0.getOperand(0), N1.getOperand(0),
32495                          DAG.getNode(ISD::TRUNCATE, DL,
32496                                        MVT::i8, ShAmt0));
32497   } else if (ShAmt1.getOpcode() == ISD::XOR) {
32498     SDValue Mask = ShAmt1.getOperand(1);
32499     if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32500       unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32501       SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32502       if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32503         ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32504       if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32505         if (Op1.getOpcode() == InnerShift &&
32506             isa<ConstantSDNode>(Op1.getOperand(1)) &&
32507             Op1.getConstantOperandVal(1) == 1) {
32508           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32509                              DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32510         }
32511         // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32512         if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32513             Op1.getOperand(0) == Op1.getOperand(1)) {
32514           return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32515                      DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32516         }
32517       }
32518     }
32519   }
32520
32521   return SDValue();
32522 }
32523
32524 /// Generate NEG and CMOV for integer abs.
32525 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32526   EVT VT = N->getValueType(0);
32527
32528   // Since X86 does not have CMOV for 8-bit integer, we don't convert
32529   // 8-bit integer abs to NEG and CMOV.
32530   if (VT.isInteger() && VT.getSizeInBits() == 8)
32531     return SDValue();
32532
32533   SDValue N0 = N->getOperand(0);
32534   SDValue N1 = N->getOperand(1);
32535   SDLoc DL(N);
32536
32537   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32538   // and change it to SUB and CMOV.
32539   if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32540       N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32541       N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32542     auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32543     if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32544       // Generate SUB & CMOV.
32545       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32546                                 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32547       SDValue Ops[] = {N0.getOperand(0), Neg,
32548                        DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32549                        SDValue(Neg.getNode(), 1)};
32550       return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32551     }
32552   }
32553   return SDValue();
32554 }
32555
32556 /// Try to turn tests against the signbit in the form of:
32557 ///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
32558 /// into:
32559 ///   SETGT(X, -1)
32560 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32561   // This is only worth doing if the output type is i8 or i1.
32562   EVT ResultType = N->getValueType(0);
32563   if (ResultType != MVT::i8 && ResultType != MVT::i1)
32564     return SDValue();
32565
32566   SDValue N0 = N->getOperand(0);
32567   SDValue N1 = N->getOperand(1);
32568
32569   // We should be performing an xor against a truncated shift.
32570   if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32571     return SDValue();
32572
32573   // Make sure we are performing an xor against one.
32574   if (!isOneConstant(N1))
32575     return SDValue();
32576
32577   // SetCC on x86 zero extends so only act on this if it's a logical shift.
32578   SDValue Shift = N0.getOperand(0);
32579   if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32580     return SDValue();
32581
32582   // Make sure we are truncating from one of i16, i32 or i64.
32583   EVT ShiftTy = Shift.getValueType();
32584   if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32585     return SDValue();
32586
32587   // Make sure the shift amount extracts the sign bit.
32588   if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32589       Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32590     return SDValue();
32591
32592   // Create a greater-than comparison against -1.
32593   // N.B. Using SETGE against 0 works but we want a canonical looking
32594   // comparison, using SETGT matches up with what TranslateX86CC.
32595   SDLoc DL(N);
32596   SDValue ShiftOp = Shift.getOperand(0);
32597   EVT ShiftOpTy = ShiftOp.getValueType();
32598   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32599   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32600                                                *DAG.getContext(), ResultType);
32601   SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32602                               DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32603   if (SetCCResultType != ResultType)
32604     Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32605   return Cond;
32606 }
32607
32608 /// Turn vector tests of the signbit in the form of:
32609 ///   xor (sra X, elt_size(X)-1), -1
32610 /// into:
32611 ///   pcmpgt X, -1
32612 ///
32613 /// This should be called before type legalization because the pattern may not
32614 /// persist after that.
32615 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32616                                          const X86Subtarget &Subtarget) {
32617   EVT VT = N->getValueType(0);
32618   if (!VT.isSimple())
32619     return SDValue();
32620
32621   switch (VT.getSimpleVT().SimpleTy) {
32622   default: return SDValue();
32623   case MVT::v16i8:
32624   case MVT::v8i16:
32625   case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32626   case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32627   case MVT::v32i8:
32628   case MVT::v16i16:
32629   case MVT::v8i32:
32630   case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32631   }
32632
32633   // There must be a shift right algebraic before the xor, and the xor must be a
32634   // 'not' operation.
32635   SDValue Shift = N->getOperand(0);
32636   SDValue Ones = N->getOperand(1);
32637   if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32638       !ISD::isBuildVectorAllOnes(Ones.getNode()))
32639     return SDValue();
32640
32641   // The shift should be smearing the sign bit across each vector element.
32642   auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32643   if (!ShiftBV)
32644     return SDValue();
32645
32646   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32647   auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32648   if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32649     return SDValue();
32650
32651   // Create a greater-than comparison against -1. We don't use the more obvious
32652   // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32653   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32654 }
32655
32656 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32657 /// is valid for the given \p Subtarget.
32658 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32659                                         const X86Subtarget &Subtarget) {
32660   if (!Subtarget.hasAVX512())
32661     return false;
32662
32663   // FIXME: Scalar type may be supported if we move it to vector register.
32664   if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32665     return false;
32666
32667   EVT SrcElVT = SrcVT.getScalarType();
32668   EVT DstElVT = DstVT.getScalarType();
32669   if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32670     return false;
32671   if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32672     return false;
32673   if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32674     return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32675   return false;
32676 }
32677
32678 /// Detect a pattern of truncation with saturation:
32679 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32680 /// Return the source value to be truncated or SDValue() if the pattern was not
32681 /// matched.
32682 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32683   if (In.getOpcode() != ISD::UMIN)
32684     return SDValue();
32685
32686   //Saturation with truncation. We truncate from InVT to VT.
32687   assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32688     "Unexpected types for truncate operation");
32689
32690   APInt C;
32691   if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C,
32692                                  /*AllowShrink*/false)) {
32693     // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32694     // the element size of the destination type.
32695     return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32696       SDValue();
32697   }
32698   return SDValue();
32699 }
32700
32701 /// Detect a pattern of truncation with saturation:
32702 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32703 /// The types should allow to use VPMOVUS* instruction on AVX512.
32704 /// Return the source value to be truncated or SDValue() if the pattern was not
32705 /// matched.
32706 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32707                                        const X86Subtarget &Subtarget) {
32708   if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32709     return SDValue();
32710   return detectUSatPattern(In, VT);
32711 }
32712
32713 static SDValue
32714 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32715                         const X86Subtarget &Subtarget) {
32716   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32717   if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32718     return SDValue();
32719   if (auto USatVal = detectUSatPattern(In, VT))
32720     if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32721       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32722   return SDValue();
32723 }
32724
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
///
/// \p In is the wide value that is about to be narrowed (it must be an
/// ISD::SRL by one for the match to succeed) and \p VT is the narrow vector
/// type (i8 or i16 elements) of the result.
/// Returns the X86ISD::AVG node of type \p VT on a match, or an empty SDValue
/// otherwise.
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget,
                                const SDLoc &DL) {
  if (!VT.isVector() || !VT.isSimple())
    return SDValue();
  EVT InVT = In.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  // Only i8/i16 element types with a power-of-two element count are handled.
  EVT ScalarVT = VT.getVectorElementType();
  if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
        isPowerOf2_32(NumElems)))
    return SDValue();

  // InScalarVT is the intermediate type in AVG pattern and it should be greater
  // than the original input type (i8/i16).
  EVT InScalarVT = InVT.getVectorElementType();
  if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
    return SDValue();

  // Bound the result vector width by subtarget: up to 512 bits with BWI,
  // up to 256 bits with AVX2, and up to 128 bits with plain SSE2.
  if (!Subtarget.hasSSE2())
    return SDValue();
  if (Subtarget.hasBWI()) {
    if (VT.getSizeInBits() > 512)
      return SDValue();
  } else if (Subtarget.hasAVX2()) {
    if (VT.getSizeInBits() > 256)
      return SDValue();
  } else {
    if (VT.getSizeInBits() > 128)
      return SDValue();
  }

  // Detect the following pattern:
  //
  //   %1 = zext <N x i8> %a to <N x i32>
  //   %2 = zext <N x i8> %b to <N x i32>
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
  //   %6 = trunc <N x i32> %5 to <N x i8>
  //
  // In AVX512, the last instruction can also be a trunc store.

  if (In.getOpcode() != ISD::SRL)
    return SDValue();

  // A lambda checking the given SDValue is a constant vector and each element
  // is in the range [Min, Max]. Any operand that is not a ConstantSDNode
  // makes the check fail.
  auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
    BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV || !BV->isConstant())
      return false;
    for (SDValue Op : V->ops()) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
      if (!C)
        return false;
      uint64_t Val = C->getZExtValue();
      if (Val < Min || Val > Max)
        return false;
    }
    return true;
  };

  // Check that each element of the vector is right-shifted by exactly one
  // (In is an SRL), and that the shifted value is an addition.
  auto LHS = In.getOperand(0);
  auto RHS = In.getOperand(1);
  if (!IsConstVectorInRange(RHS, 1, 1))
    return SDValue();
  if (LHS.getOpcode() != ISD::ADD)
    return SDValue();

  // Detect a pattern of a + b + 1 where the order doesn't matter.
  SDValue Operands[3];
  Operands[0] = LHS.getOperand(0);
  Operands[1] = LHS.getOperand(1);

  // Take care of the case when one of the operands is a constant vector whose
  // elements are in the range [1, 256] for i8 results or [1, 65536] for i16
  // results, i.e. the "+ 1" has already been merged into the constant.
  if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
      Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
      Operands[0].getOperand(0).getValueType() == VT) {
    // The pattern is detected. Subtract one from the constant vector, then
    // demote it and emit X86ISD::AVG instruction.
    SDValue VecOnes = DAG.getConstant(1, DL, InVT);
    Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
    Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                       Operands[1]);
  }

  // Otherwise one operand of the outer add must itself be an add. Move it to
  // Operands[1] and split it so that all three addends are in Operands[0..2].
  if (Operands[0].getOpcode() == ISD::ADD)
    std::swap(Operands[0], Operands[1]);
  else if (Operands[1].getOpcode() != ISD::ADD)
    return SDValue();
  Operands[2] = Operands[1].getOperand(0);
  Operands[1] = Operands[1].getOperand(1);

  // Now we have three operands of two additions. Check that one of them is a
  // constant vector with ones, and the other two are promoted from i8/i16.
  for (int i = 0; i < 3; ++i) {
    if (!IsConstVectorInRange(Operands[i], 1, 1))
      continue;
    // Park the all-ones vector in slot 2 so slots 0 and 1 hold a and b.
    std::swap(Operands[i], Operands[2]);

    // Check if Operands[0] and Operands[1] are results of type promotion.
    for (int j = 0; j < 2; ++j)
      if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
          Operands[j].getOperand(0).getValueType() != VT)
        return SDValue();

    // The pattern is detected, emit X86ISD::AVG instruction.
    return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
                       Operands[1].getOperand(0));
  }

  return SDValue();
}
32845
32846 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32847                            TargetLowering::DAGCombinerInfo &DCI,
32848                            const X86Subtarget &Subtarget) {
32849   LoadSDNode *Ld = cast<LoadSDNode>(N);
32850   EVT RegVT = Ld->getValueType(0);
32851   EVT MemVT = Ld->getMemoryVT();
32852   SDLoc dl(Ld);
32853   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32854
32855   // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32856   // into two 16-byte operations. Also split non-temporal aligned loads on
32857   // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
32858   ISD::LoadExtType Ext = Ld->getExtensionType();
32859   bool Fast;
32860   unsigned AddressSpace = Ld->getAddressSpace();
32861   unsigned Alignment = Ld->getAlignment();
32862   if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32863       Ext == ISD::NON_EXTLOAD &&
32864       ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32865        (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32866                                AddressSpace, Alignment, &Fast) && !Fast))) {
32867     unsigned NumElems = RegVT.getVectorNumElements();
32868     if (NumElems < 2)
32869       return SDValue();
32870
32871     SDValue Ptr = Ld->getBasePtr();
32872
32873     EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32874                                   NumElems/2);
32875     SDValue Load1 =
32876         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32877                     Alignment, Ld->getMemOperand()->getFlags());
32878
32879     Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32880     SDValue Load2 =
32881         DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32882                     std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32883     SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32884                              Load1.getValue(1),
32885                              Load2.getValue(1));
32886
32887     SDValue NewVec = DAG.getUNDEF(RegVT);
32888     NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32889     NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32890     return DCI.CombineTo(N, NewVec, TF, true);
32891   }
32892
32893   return SDValue();
32894 }
32895
32896 /// If V is a build vector of boolean constants and exactly one of those
32897 /// constants is true, return the operand index of that true element.
32898 /// Otherwise, return -1.
32899 static int getOneTrueElt(SDValue V) {
32900   // This needs to be a build vector of booleans.
32901   // TODO: Checking for the i1 type matches the IR definition for the mask,
32902   // but the mask check could be loosened to i8 or other types. That might
32903   // also require checking more than 'allOnesValue'; eg, the x86 HW
32904   // instructions only require that the MSB is set for each mask element.
32905   // The ISD::MSTORE comments/definition do not specify how the mask operand
32906   // is formatted.
32907   auto *BV = dyn_cast<BuildVectorSDNode>(V);
32908   if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32909     return -1;
32910
32911   int TrueIndex = -1;
32912   unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32913   for (unsigned i = 0; i < NumElts; ++i) {
32914     const SDValue &Op = BV->getOperand(i);
32915     if (Op.isUndef())
32916       continue;
32917     auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32918     if (!ConstNode)
32919       return -1;
32920     if (ConstNode->getAPIntValue().isAllOnesValue()) {
32921       // If we already found a one, this is too many.
32922       if (TrueIndex >= 0)
32923         return -1;
32924       TrueIndex = i;
32925     }
32926   }
32927   return TrueIndex;
32928 }
32929
32930 /// Given a masked memory load/store operation, return true if it has one mask
32931 /// bit set. If it has one mask bit set, then also return the memory address of
32932 /// the scalar element to load/store, the vector index to insert/extract that
32933 /// scalar element, and the alignment for the scalar memory access.
32934 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32935                                          SelectionDAG &DAG, SDValue &Addr,
32936                                          SDValue &Index, unsigned &Alignment) {
32937   int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32938   if (TrueMaskElt < 0)
32939     return false;
32940
32941   // Get the address of the one scalar element that is specified by the mask
32942   // using the appropriate offset from the base pointer.
32943   EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32944   Addr = MaskedOp->getBasePtr();
32945   if (TrueMaskElt != 0) {
32946     unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32947     Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32948   }
32949
32950   Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32951   Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32952   return true;
32953 }
32954
32955 /// If exactly one element of the mask is set for a non-extending masked load,
32956 /// it is a scalar load and vector insert.
32957 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32958 /// mask have already been optimized in IR, so we don't bother with those here.
32959 static SDValue
32960 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32961                              TargetLowering::DAGCombinerInfo &DCI) {
32962   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32963   // However, some target hooks may need to be added to know when the transform
32964   // is profitable. Endianness would also have to be considered.
32965
32966   SDValue Addr, VecIndex;
32967   unsigned Alignment;
32968   if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32969     return SDValue();
32970
32971   // Load the one scalar element that is specified by the mask using the
32972   // appropriate offset from the base pointer.
32973   SDLoc DL(ML);
32974   EVT VT = ML->getValueType(0);
32975   EVT EltVT = VT.getVectorElementType();
32976   SDValue Load =
32977       DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32978                   Alignment, ML->getMemOperand()->getFlags());
32979
32980   // Insert the loaded element into the appropriate place in the vector.
32981   SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32982                                Load, VecIndex);
32983   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
32984 }
32985
32986 static SDValue
32987 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32988                               TargetLowering::DAGCombinerInfo &DCI) {
32989   if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32990     return SDValue();
32991
32992   SDLoc DL(ML);
32993   EVT VT = ML->getValueType(0);
32994
32995   // If we are loading the first and last elements of a vector, it is safe and
32996   // always faster to load the whole vector. Replace the masked load with a
32997   // vector load and select.
32998   unsigned NumElts = VT.getVectorNumElements();
32999   BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
33000   bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
33001   bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
33002   if (LoadFirstElt && LoadLastElt) {
33003     SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33004                                 ML->getMemOperand());
33005     SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
33006     return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
33007   }
33008
33009   // Convert a masked load with a constant mask into a masked load and a select.
33010   // This allows the select operation to use a faster kind of select instruction
33011   // (for example, vblendvps -> vblendps).
33012
33013   // Don't try this if the pass-through operand is already undefined. That would
33014   // cause an infinite loop because that's what we're about to create.
33015   if (ML->getSrc0().isUndef())
33016     return SDValue();
33017
33018   // The new masked load has an undef pass-through operand. The select uses the
33019   // original pass-through operand.
33020   SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33021                                     ML->getMask(), DAG.getUNDEF(VT),
33022                                     ML->getMemoryVT(), ML->getMemOperand(),
33023                                     ML->getExtensionType());
33024   SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
33025
33026   return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
33027 }
33028
/// DAG combine for masked loads.
///
/// Expanding loads are left alone. Non-extending loads are first offered to
/// reduceMaskedLoadToScalarLoad and, on subtargets without AVX512, to
/// combineMaskedLoadConstantMask. Sign-extending loads are rewritten as a
/// non-extending masked load of a same-sized vector with narrow elements,
/// followed by an in-register sign extension (X86ISD::VSEXT). Any other
/// extension type yields an empty SDValue.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget &Subtarget) {
  MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

  // TODO: Expanding load with constant mask may be optimized as well.
  if (Mld->isExpandingLoad())
    return SDValue();

  if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
      return ScalarLoad;
    // TODO: Do some AVX512 subsets benefit from this transform?
    if (!Subtarget.hasAVX512())
      if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
        return Blend;
  }

  // Only sign-extending loads are resolved below.
  if (Mld->getExtensionType() != ISD::SEXTLOAD)
    return SDValue();

  // Resolve extending loads.
  EVT VT = Mld->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  EVT LdVT = Mld->getMemoryVT();
  SDLoc dl(Mld);

  assert(LdVT != VT && "Cannot extend to the same type");
  unsigned ToSz = VT.getScalarSizeInBits();
  unsigned FromSz = LdVT.getScalarSizeInBits();
  // From/To sizes and ElemCount must be powers of two.
  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
    "Unexpected size for extending masked load");

  // Number of narrow (memory) elements that fit into one wide (result)
  // element.
  unsigned SizeRatio  = ToSz / FromSz;
  assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle: as many bits as VT, but
  // made of memory-sized elements.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
          LdVT.getScalarType(), NumElems*SizeRatio);
  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Convert Src0 value.
  SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
  if (!Mld->getSrc0().isUndef()) {
    // Move narrow element i * SizeRatio of the bitcast pass-through value
    // into lane i; the remaining lanes are left undefined (-1).
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;

    // Can't shuffle using an illegal type.
    assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
           "WideVecVT should be legal");
    WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
                                    DAG.getUNDEF(WideVecVT), ShuffleVec);
  }
  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    // Compact the mask the same way as the pass-through value. Lanes at and
    // above NumElems use index NumElems * SizeRatio, which selects the first
    // element of the second shuffle operand (the zero vector below), so those
    // lanes are masked off.
    SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
      ShuffleVec[i] = NumElems * SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    // Otherwise the mask is a vector of i1: widen it by concatenating
    // all-zero vectors of the mask's type after the original mask.
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  // Issue the non-extending masked load on the narrow-element vector, then
  // sign-extend in the register and replace both results of the original.
  SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
                                     Mld->getBasePtr(), NewMask, WideSrc0,
                                     Mld->getMemoryVT(), Mld->getMemOperand(),
                                     ISD::NON_EXTLOAD);
  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
  return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
}
33122
33123 /// If exactly one element of the mask is set for a non-truncating masked store,
33124 /// it is a vector extract and scalar store.
33125 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33126 /// mask have already been optimized in IR, so we don't bother with those here.
33127 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33128                                               SelectionDAG &DAG) {
33129   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33130   // However, some target hooks may need to be added to know when the transform
33131   // is profitable. Endianness would also have to be considered.
33132
33133   SDValue Addr, VecIndex;
33134   unsigned Alignment;
33135   if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33136     return SDValue();
33137
33138   // Extract the one scalar element that is actually being stored.
33139   SDLoc DL(MS);
33140   EVT VT = MS->getValue().getValueType();
33141   EVT EltVT = VT.getVectorElementType();
33142   SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33143                                 MS->getValue(), VecIndex);
33144
33145   // Store that element at the appropriate offset from the base pointer.
33146   return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33147                       Alignment, MS->getMemOperand()->getFlags());
33148 }
33149
/// DAG combine for masked stores.
///
/// Compressing stores are left alone. Non-truncating stores are offered to
/// reduceMaskedStoreToScalarStore. Truncating stores that the target does not
/// support directly are rewritten as a shuffle that packs the narrow elements
/// into the low lanes of a same-sized vector, followed by a non-truncating
/// masked store with a correspondingly widened mask.
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

  if (Mst->isCompressingStore())
    return SDValue();

  if (!Mst->isTruncatingStore())
    return reduceMaskedStoreToScalarStore(Mst, DAG);

  // Resolve truncating stores.
  EVT VT = Mst->getValue().getValueType();
  unsigned NumElems = VT.getVectorNumElements();
  EVT StVT = Mst->getMemoryVT();
  SDLoc dl(Mst);

  assert(StVT != VT && "Cannot truncate to the same type");
  unsigned FromSz = VT.getScalarSizeInBits();
  unsigned ToSz = StVT.getScalarSizeInBits();

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

  // The truncating store is legal in some cases. For example
  // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
  // are designated for truncate store.
  // In this case we don't need any further transformations.
  if (TLI.isTruncStoreLegal(VT, StVT))
    return SDValue();

  // From/To sizes and ElemCount must be powers of two.
  assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
    "Unexpected size for truncating masked store");
  // We are going to use the original vector elt for storing.
  // Accumulated smaller vector elements must be a multiple of the store size.
  assert (((NumElems * FromSz) % ToSz) == 0 &&
          "Unexpected ratio for truncating masked store");

  // Number of narrow (memory) elements that fit into one wide (value)
  // element.
  unsigned SizeRatio  = FromSz / ToSz;
  assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

  // Create a type on which we perform the shuffle: as many bits as VT, but
  // made of memory-sized elements.
  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
          StVT.getScalarType(), NumElems*SizeRatio);

  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

  // Move narrow element i * SizeRatio of the bitcast value into lane i; the
  // remaining lanes are left undefined (-1).
  SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
  SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
  for (unsigned i = 0; i != NumElems; ++i)
    ShuffleVec[i] = i * SizeRatio;

  // Can't shuffle using an illegal type.
  assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
         "WideVecVT should be legal");

  SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
                                              DAG.getUNDEF(WideVecVT),
                                              ShuffleVec);

  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mst->getMask();
  if (Mask.getValueType() == VT) {
    // Mask and original value have the same type.
    NewMask = DAG.getBitcast(WideVecVT, Mask);
    // ShuffleVec is reused here. Lanes at and above NumElems are redirected
    // to index NumElems*SizeRatio, which selects the first element of the
    // second shuffle operand (the zero vector below), so those lanes are
    // masked off.
    for (unsigned i = 0; i != NumElems; ++i)
      ShuffleVec[i] = i * SizeRatio;
    for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
      ShuffleVec[i] = NumElems*SizeRatio;
    NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
                                   DAG.getConstant(0, dl, WideVecVT),
                                   ShuffleVec);
  } else {
    // Otherwise the mask is a vector of i1: widen it by concatenating
    // all-zero vectors of the mask's type after the original mask.
    assert(Mask.getValueType().getVectorElementType() == MVT::i1);
    unsigned WidenNumElts = NumElems*SizeRatio;
    unsigned MaskNumElts = VT.getVectorNumElements();
    EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(),  MVT::i1,
                                     WidenNumElts);

    unsigned NumConcat = WidenNumElts / MaskNumElts;
    SmallVector<SDValue, 16> Ops(NumConcat);
    SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
    Ops[0] = Mask;
    for (unsigned i = 1; i != NumConcat; ++i)
      Ops[i] = ZeroVal;

    NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
  }

  // Emit the masked store of the packed value; the final 'false' marks it as
  // non-truncating.
  return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
                            Mst->getBasePtr(), NewMask, StVT,
                            Mst->getMemOperand(), false);
}
33242
33243 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
33244                             const X86Subtarget &Subtarget) {
33245   StoreSDNode *St = cast<StoreSDNode>(N);
33246   EVT VT = St->getValue().getValueType();
33247   EVT StVT = St->getMemoryVT();
33248   SDLoc dl(St);
33249   SDValue StoredVal = St->getOperand(1);
33250   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33251
33252   // If we are saving a concatenation of two XMM registers and 32-byte stores
33253   // are slow, such as on Sandy Bridge, perform two 16-byte stores.
33254   bool Fast;
33255   unsigned AddressSpace = St->getAddressSpace();
33256   unsigned Alignment = St->getAlignment();
33257   if (VT.is256BitVector() && StVT == VT &&
33258       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
33259                              AddressSpace, Alignment, &Fast) &&
33260       !Fast) {
33261     unsigned NumElems = VT.getVectorNumElements();
33262     if (NumElems < 2)
33263       return SDValue();
33264
33265     SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
33266     SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
33267
33268     SDValue Ptr0 = St->getBasePtr();
33269     SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
33270
33271     SDValue Ch0 =
33272         DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
33273                      Alignment, St->getMemOperand()->getFlags());
33274     SDValue Ch1 =
33275         DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
33276                      std::min(16U, Alignment), St->getMemOperand()->getFlags());
33277     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
33278   }
33279
33280   // Optimize trunc store (of multiple scalars) to shuffle and store.
33281   // First, pack all of the elements in one place. Next, store to memory
33282   // in fewer chunks.
33283   if (St->isTruncatingStore() && VT.isVector()) {
33284     // Check if we can detect an AVG pattern from the truncation. If yes,
33285     // replace the trunc store by a normal store with the result of X86ISD::AVG
33286     // instruction.
33287     if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
33288                                        Subtarget, dl))
33289       return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
33290                           St->getPointerInfo(), St->getAlignment(),
33291                           St->getMemOperand()->getFlags());
33292
33293     if (SDValue Val =
33294         detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
33295       return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
33296                              dl, Val, St->getBasePtr(),
33297                              St->getMemoryVT(), St->getMemOperand(), DAG);
33298
33299     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33300     unsigned NumElems = VT.getVectorNumElements();
33301     assert(StVT != VT && "Cannot truncate to the same type");
33302     unsigned FromSz = VT.getScalarSizeInBits();
33303     unsigned ToSz = StVT.getScalarSizeInBits();
33304
33305     // The truncating store is legal in some cases. For example
33306     // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33307     // are designated for truncate store.
33308     // In this case we don't need any further transformations.
33309     if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
33310       return SDValue();
33311
33312     // From, To sizes and ElemCount must be pow of two
33313     if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
33314     // We are going to use the original vector elt for storing.
33315     // Accumulated smaller vector elements must be a multiple of the store size.
33316     if (0 != (NumElems * FromSz) % ToSz) return SDValue();
33317
33318     unsigned SizeRatio  = FromSz / ToSz;
33319
33320     assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33321
33322     // Create a type on which we perform the shuffle
33323     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33324             StVT.getScalarType(), NumElems*SizeRatio);
33325
33326     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33327
33328     SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
33329     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
33330     for (unsigned i = 0; i != NumElems; ++i)
33331       ShuffleVec[i] = i * SizeRatio;
33332
33333     // Can't shuffle using an illegal type.
33334     if (!TLI.isTypeLegal(WideVecVT))
33335       return SDValue();
33336
33337     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33338                                          DAG.getUNDEF(WideVecVT),
33339                                          ShuffleVec);
33340     // At this point all of the data is stored at the bottom of the
33341     // register. We now need to save it to mem.
33342
33343     // Find the largest store unit
33344     MVT StoreType = MVT::i8;
33345     for (MVT Tp : MVT::integer_valuetypes()) {
33346       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
33347         StoreType = Tp;
33348     }
33349
33350     // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
33351     if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
33352         (64 <= NumElems * ToSz))
33353       StoreType = MVT::f64;
33354
33355     // Bitcast the original vector into a vector of store-size units
33356     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
33357             StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
33358     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33359     SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
33360     SmallVector<SDValue, 8> Chains;
33361     SDValue Ptr = St->getBasePtr();
33362
33363     // Perform one or more big stores into memory.
33364     for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
33365       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
33366                                    StoreType, ShuffWide,
33367                                    DAG.getIntPtrConstant(i, dl));
33368       SDValue Ch =
33369           DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
33370                        St->getAlignment(), St->getMemOperand()->getFlags());
33371       Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
33372       Chains.push_back(Ch);
33373     }
33374
33375     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
33376   }
33377
33378   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
33379   // the FP state in cases where an emms may be missing.
33380   // A preferable solution to the general problem is to figure out the right
33381   // places to insert EMMS.  This qualifies as a quick hack.
33382
33383   // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
33384   if (VT.getSizeInBits() != 64)
33385     return SDValue();
33386
33387   const Function *F = DAG.getMachineFunction().getFunction();
33388   bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
33389   bool F64IsLegal =
33390       !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
33391   if ((VT.isVector() ||
33392        (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
33393       isa<LoadSDNode>(St->getValue()) &&
33394       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
33395       St->getChain().hasOneUse() && !St->isVolatile()) {
33396     SDNode* LdVal = St->getValue().getNode();
33397     LoadSDNode *Ld = nullptr;
33398     int TokenFactorIndex = -1;
33399     SmallVector<SDValue, 8> Ops;
33400     SDNode* ChainVal = St->getChain().getNode();
33401     // Must be a store of a load.  We currently handle two cases:  the load
33402     // is a direct child, and it's under an intervening TokenFactor.  It is
33403     // possible to dig deeper under nested TokenFactors.
33404     if (ChainVal == LdVal)
33405       Ld = cast<LoadSDNode>(St->getChain());
33406     else if (St->getValue().hasOneUse() &&
33407              ChainVal->getOpcode() == ISD::TokenFactor) {
33408       for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
33409         if (ChainVal->getOperand(i).getNode() == LdVal) {
33410           TokenFactorIndex = i;
33411           Ld = cast<LoadSDNode>(St->getValue());
33412         } else
33413           Ops.push_back(ChainVal->getOperand(i));
33414       }
33415     }
33416
33417     if (!Ld || !ISD::isNormalLoad(Ld))
33418       return SDValue();
33419
33420     // If this is not the MMX case, i.e. we are just turning i64 load/store
33421     // into f64 load/store, avoid the transformation if there are multiple
33422     // uses of the loaded value.
33423     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
33424       return SDValue();
33425
33426     SDLoc LdDL(Ld);
33427     SDLoc StDL(N);
33428     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
33429     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
33430     // pair instead.
33431     if (Subtarget.is64Bit() || F64IsLegal) {
33432       MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
33433       SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
33434                                   Ld->getPointerInfo(), Ld->getAlignment(),
33435                                   Ld->getMemOperand()->getFlags());
33436       // Make sure new load is placed in same chain order.
33437       SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
33438       if (TokenFactorIndex >= 0) {
33439         Ops.push_back(NewChain);
33440         NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33441       }
33442       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
33443                           St->getPointerInfo(), St->getAlignment(),
33444                           St->getMemOperand()->getFlags());
33445     }
33446
33447     // Otherwise, lower to two pairs of 32-bit loads / stores.
33448     SDValue LoAddr = Ld->getBasePtr();
33449     SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
33450
33451     SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
33452                                Ld->getPointerInfo(), Ld->getAlignment(),
33453                                Ld->getMemOperand()->getFlags());
33454     SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
33455                                Ld->getPointerInfo().getWithOffset(4),
33456                                MinAlign(Ld->getAlignment(), 4),
33457                                Ld->getMemOperand()->getFlags());
33458     // Make sure new loads are placed in same chain order.
33459     SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
33460     NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
33461
33462     if (TokenFactorIndex >= 0) {
33463       Ops.push_back(NewChain);
33464       NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33465     }
33466
33467     LoAddr = St->getBasePtr();
33468     HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
33469
33470     SDValue LoSt =
33471         DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
33472                      St->getAlignment(), St->getMemOperand()->getFlags());
33473     SDValue HiSt = DAG.getStore(
33474         NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
33475         MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
33476     return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
33477   }
33478
33479   // This is similar to the above case, but here we handle a scalar 64-bit
33480   // integer store that is extracted from a vector on a 32-bit target.
33481   // If we have SSE2, then we can treat it like a floating-point double
33482   // to get past legalization. The execution dependencies fixup pass will
33483   // choose the optimal machine instruction for the store if this really is
33484   // an integer or v2f32 rather than an f64.
33485   if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
33486       St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
33487     SDValue OldExtract = St->getOperand(1);
33488     SDValue ExtOp0 = OldExtract.getOperand(0);
33489     unsigned VecSize = ExtOp0.getValueSizeInBits();
33490     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33491     SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33492     SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33493                                      BitCast, OldExtract.getOperand(1));
33494     return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33495                         St->getPointerInfo(), St->getAlignment(),
33496                         St->getMemOperand()->getFlags());
33497   }
33498
33499   return SDValue();
33500 }
33501
33502 /// Return 'true' if this vector operation is "horizontal"
33503 /// and return the operands for the horizontal operation in LHS and RHS.  A
33504 /// horizontal operation performs the binary operation on successive elements
33505 /// of its first operand, then on successive elements of its second operand,
33506 /// returning the resulting values in a vector.  For example, if
33507 ///   A = < float a0, float a1, float a2, float a3 >
33508 /// and
33509 ///   B = < float b0, float b1, float b2, float b3 >
33510 /// then the result of doing a horizontal operation on A and B is
33511 ///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33512 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33513 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33514 /// set to A, RHS to B, and the routine returns 'true'.
33515 /// Note that the binary operation should have the property that if one of the
33516 /// operands is UNDEF then the result is UNDEF.
33517 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33518   // Look for the following pattern: if
33519   //   A = < float a0, float a1, float a2, float a3 >
33520   //   B = < float b0, float b1, float b2, float b3 >
33521   // and
33522   //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33523   //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33524   // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33525   // which is A horizontal-op B.
33526
33527   // At least one of the operands should be a vector shuffle.
33528   if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33529       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33530     return false;
33531
33532   MVT VT = LHS.getSimpleValueType();
33533
33534   assert((VT.is128BitVector() || VT.is256BitVector()) &&
33535          "Unsupported vector type for horizontal add/sub");
33536
33537   // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33538   // operate independently on 128-bit lanes.
33539   unsigned NumElts = VT.getVectorNumElements();
33540   unsigned NumLanes = VT.getSizeInBits()/128;
33541   unsigned NumLaneElts = NumElts / NumLanes;
33542   assert((NumLaneElts % 2 == 0) &&
33543          "Vector type should have an even number of elements in each lane");
33544   unsigned HalfLaneElts = NumLaneElts/2;
33545
33546   // View LHS in the form
33547   //   LHS = VECTOR_SHUFFLE A, B, LMask
33548   // If LHS is not a shuffle then pretend it is the shuffle
33549   //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33550   // NOTE: in what follows a default initialized SDValue represents an UNDEF of
33551   // type VT.
33552   SDValue A, B;
33553   SmallVector<int, 16> LMask(NumElts);
33554   if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33555     if (!LHS.getOperand(0).isUndef())
33556       A = LHS.getOperand(0);
33557     if (!LHS.getOperand(1).isUndef())
33558       B = LHS.getOperand(1);
33559     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33560     std::copy(Mask.begin(), Mask.end(), LMask.begin());
33561   } else {
33562     if (!LHS.isUndef())
33563       A = LHS;
33564     for (unsigned i = 0; i != NumElts; ++i)
33565       LMask[i] = i;
33566   }
33567
33568   // Likewise, view RHS in the form
33569   //   RHS = VECTOR_SHUFFLE C, D, RMask
33570   SDValue C, D;
33571   SmallVector<int, 16> RMask(NumElts);
33572   if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33573     if (!RHS.getOperand(0).isUndef())
33574       C = RHS.getOperand(0);
33575     if (!RHS.getOperand(1).isUndef())
33576       D = RHS.getOperand(1);
33577     ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33578     std::copy(Mask.begin(), Mask.end(), RMask.begin());
33579   } else {
33580     if (!RHS.isUndef())
33581       C = RHS;
33582     for (unsigned i = 0; i != NumElts; ++i)
33583       RMask[i] = i;
33584   }
33585
33586   // Check that the shuffles are both shuffling the same vectors.
33587   if (!(A == C && B == D) && !(A == D && B == C))
33588     return false;
33589
33590   // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33591   if (!A.getNode() && !B.getNode())
33592     return false;
33593
33594   // If A and B occur in reverse order in RHS, then "swap" them (which means
33595   // rewriting the mask).
33596   if (A != C)
33597     ShuffleVectorSDNode::commuteMask(RMask);
33598
33599   // At this point LHS and RHS are equivalent to
33600   //   LHS = VECTOR_SHUFFLE A, B, LMask
33601   //   RHS = VECTOR_SHUFFLE A, B, RMask
33602   // Check that the masks correspond to performing a horizontal operation.
33603   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33604     for (unsigned i = 0; i != NumLaneElts; ++i) {
33605       int LIdx = LMask[i+l], RIdx = RMask[i+l];
33606
33607       // Ignore any UNDEF components.
33608       if (LIdx < 0 || RIdx < 0 ||
33609           (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33610           (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33611         continue;
33612
33613       // Check that successive elements are being operated on.  If not, this is
33614       // not a horizontal operation.
33615       unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33616       int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
33617       if (!(LIdx == Index && RIdx == Index + 1) &&
33618           !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33619         return false;
33620     }
33621   }
33622
33623   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33624   RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33625   return true;
33626 }
33627
33628 /// Do target-specific dag combines on floating-point adds/subs.
33629 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33630                                const X86Subtarget &Subtarget) {
33631   EVT VT = N->getValueType(0);
33632   SDValue LHS = N->getOperand(0);
33633   SDValue RHS = N->getOperand(1);
33634   bool IsFadd = N->getOpcode() == ISD::FADD;
33635   assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33636
33637   // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33638   if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33639        (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33640       isHorizontalBinOp(LHS, RHS, IsFadd)) {
33641     auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33642     return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33643   }
33644   return SDValue();
33645 }
33646
33647 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33648 /// the codegen.
33649 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33650 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33651                                           const X86Subtarget &Subtarget,
33652                                           SDLoc &DL) {
33653   assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33654   SDValue Src = N->getOperand(0);
33655   unsigned Opcode = Src.getOpcode();
33656   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33657
33658   EVT VT = N->getValueType(0);
33659   EVT SrcVT = Src.getValueType();
33660
33661   auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33662     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33663
33664     // Repeated operand, so we are only trading one output truncation for
33665     // one input truncation.
33666     if (Op0 == Op1)
33667       return true;
33668
33669     // See if either operand has been extended from a smaller/equal size to
33670     // the truncation size, allowing a truncation to combine with the extend.
33671     unsigned Opcode0 = Op0.getOpcode();
33672     if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33673          Opcode0 == ISD::ZERO_EXTEND) &&
33674         Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33675       return true;
33676
33677     unsigned Opcode1 = Op1.getOpcode();
33678     if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33679          Opcode1 == ISD::ZERO_EXTEND) &&
33680         Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33681       return true;
33682
33683     // See if either operand is a single use constant which can be constant
33684     // folded.
33685     SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33686     SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33687     return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33688            ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33689   };
33690
33691   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33692     SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33693     SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33694     return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33695   };
33696
33697   // Don't combine if the operation has other uses.
33698   if (!N->isOnlyUserOf(Src.getNode()))
33699     return SDValue();
33700
33701   // Only support vector truncation for now.
33702   // TODO: i64 scalar math would benefit as well.
33703   if (!VT.isVector())
33704     return SDValue();
33705
33706   // In most cases its only worth pre-truncating if we're only facing the cost
33707   // of one truncation.
33708   // i.e. if one of the inputs will constant fold or the input is repeated.
33709   switch (Opcode) {
33710   case ISD::AND:
33711   case ISD::XOR:
33712   case ISD::OR: {
33713     SDValue Op0 = Src.getOperand(0);
33714     SDValue Op1 = Src.getOperand(1);
33715     if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33716         IsRepeatedOpOrFreeTruncation(Op0, Op1))
33717       return TruncateArithmetic(Op0, Op1);
33718     break;
33719   }
33720
33721   case ISD::MUL:
33722     // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
33723     // better to truncate if we have the chance.
33724     if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33725         !TLI.isOperationLegal(Opcode, SrcVT))
33726       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33727     LLVM_FALLTHROUGH;
33728   case ISD::ADD: {
33729     SDValue Op0 = Src.getOperand(0);
33730     SDValue Op1 = Src.getOperand(1);
33731     if (TLI.isOperationLegal(Opcode, VT) &&
33732         IsRepeatedOpOrFreeTruncation(Op0, Op1))
33733       return TruncateArithmetic(Op0, Op1);
33734     break;
33735   }
33736   }
33737
33738   return SDValue();
33739 }
33740
33741 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33742 static SDValue
33743 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33744                                   SmallVector<SDValue, 8> &Regs) {
33745   assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33746                              Regs[0].getValueType() == MVT::v2i64));
33747   EVT OutVT = N->getValueType(0);
33748   EVT OutSVT = OutVT.getVectorElementType();
33749   EVT InVT = Regs[0].getValueType();
33750   EVT InSVT = InVT.getVectorElementType();
33751   SDLoc DL(N);
33752
33753   // First, use mask to unset all bits that won't appear in the result.
33754   assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33755          "OutSVT can only be either i8 or i16.");
33756   APInt Mask =
33757       APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33758   SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33759   for (auto &Reg : Regs)
33760     Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33761
33762   MVT UnpackedVT, PackedVT;
33763   if (OutSVT == MVT::i8) {
33764     UnpackedVT = MVT::v8i16;
33765     PackedVT = MVT::v16i8;
33766   } else {
33767     UnpackedVT = MVT::v4i32;
33768     PackedVT = MVT::v8i16;
33769   }
33770
33771   // In each iteration, truncate the type by a half size.
33772   auto RegNum = Regs.size();
33773   for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33774        j < e; j *= 2, RegNum /= 2) {
33775     for (unsigned i = 0; i < RegNum; i++)
33776       Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33777     for (unsigned i = 0; i < RegNum / 2; i++)
33778       Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33779                             Regs[i * 2 + 1]);
33780   }
33781
33782   // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
33783   // then extract a subvector as the result since v8i8 is not a legal type.
33784   if (OutVT == MVT::v8i8) {
33785     Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33786     Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33787                           DAG.getIntPtrConstant(0, DL));
33788     return Regs[0];
33789   } else if (RegNum > 1) {
33790     Regs.resize(RegNum);
33791     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33792   } else
33793     return Regs[0];
33794 }
33795
33796 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33797 static SDValue
33798 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33799                                   SelectionDAG &DAG,
33800                                   SmallVector<SDValue, 8> &Regs) {
33801   assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33802   EVT OutVT = N->getValueType(0);
33803   SDLoc DL(N);
33804
33805   // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
33806   SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33807   for (auto &Reg : Regs) {
33808     Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33809                               Subtarget, DAG);
33810     Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33811                               Subtarget, DAG);
33812   }
33813
33814   for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33815     Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33816                           Regs[i * 2 + 1]);
33817
33818   if (Regs.size() > 2) {
33819     Regs.resize(Regs.size() / 2);
33820     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33821   } else
33822     return Regs[0];
33823 }
33824
33825 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33826 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33827 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33828 /// element that is extracted from a vector and then truncated, and it is
33829 /// difficult to do this optimization based on them.
33830 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33831                                        const X86Subtarget &Subtarget) {
33832   EVT OutVT = N->getValueType(0);
33833   if (!OutVT.isVector())
33834     return SDValue();
33835
33836   SDValue In = N->getOperand(0);
33837   if (!In.getValueType().isSimple())
33838     return SDValue();
33839
33840   EVT InVT = In.getValueType();
33841   unsigned NumElems = OutVT.getVectorNumElements();
33842
33843   // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33844   // SSE2, and we need to take care of it specially.
33845   // AVX512 provides vpmovdb.
33846   if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33847     return SDValue();
33848
33849   EVT OutSVT = OutVT.getVectorElementType();
33850   EVT InSVT = InVT.getVectorElementType();
33851   if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33852         (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33853         NumElems >= 8))
33854     return SDValue();
33855
33856   // SSSE3's pshufb results in less instructions in the cases below.
33857   if (Subtarget.hasSSSE3() && NumElems == 8 &&
33858       ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33859        (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33860     return SDValue();
33861
33862   SDLoc DL(N);
33863
33864   // Split a long vector into vectors of legal type.
33865   unsigned RegNum = InVT.getSizeInBits() / 128;
33866   SmallVector<SDValue, 8> SubVec(RegNum);
33867   unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33868   EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33869
33870   for (unsigned i = 0; i < RegNum; i++)
33871     SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33872                             DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33873
33874   // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33875   // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33876   // truncate 2 x v4i32 to v8i16.
33877   if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33878     return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33879   else if (InSVT == MVT::i32)
33880     return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33881   else
33882     return SDValue();
33883 }
33884
33885 /// This function transforms vector truncation of 'all or none' bits values.
33886 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
33887 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33888                                                SelectionDAG &DAG,
33889                                                const X86Subtarget &Subtarget) {
33890   // Requires SSE2 but AVX512 has fast truncate.
33891   if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33892     return SDValue();
33893
33894   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33895     return SDValue();
33896
33897   SDValue In = N->getOperand(0);
33898   if (!In.getValueType().isSimple())
33899     return SDValue();
33900
33901   MVT VT = N->getValueType(0).getSimpleVT();
33902   MVT SVT = VT.getScalarType();
33903
33904   MVT InVT = In.getValueType().getSimpleVT();
33905   MVT InSVT = InVT.getScalarType();
33906
33907   // Use PACKSS if the input is a splatted sign bit.
33908   // e.g. Comparison result, sext_in_reg, etc.
33909   unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33910   if (NumSignBits != InSVT.getSizeInBits())
33911     return SDValue();
33912
33913   // Check we have a truncation suited for PACKSS.
33914   if (!VT.is128BitVector() && !VT.is256BitVector())
33915     return SDValue();
33916   if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33917     return SDValue();
33918   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33919     return SDValue();
33920
33921   return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33922 }
33923
33924 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33925                                const X86Subtarget &Subtarget) {
33926   EVT VT = N->getValueType(0);
33927   SDValue Src = N->getOperand(0);
33928   SDLoc DL(N);
33929
33930   // Attempt to pre-truncate inputs to arithmetic ops instead.
33931   if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33932     return V;
33933
33934   // Try to detect AVG pattern first.
33935   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33936     return Avg;
33937
33938   // Try to combine truncation with unsigned saturation.
33939   if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33940     return Val;
33941
33942   // The bitcast source is a direct mmx result.
33943   // Detect bitcasts between i32 to x86mmx
33944   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33945     SDValue BCSrc = Src.getOperand(0);
33946     if (BCSrc.getValueType() == MVT::x86mmx)
33947       return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33948   }
33949
33950   // Try to truncate extended sign bits with PACKSS.
33951   if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33952     return V;
33953
33954   return combineVectorTruncation(N, DAG, Subtarget);
33955 }
33956
33957 /// Returns the negated value if the node \p N flips sign of FP value.
33958 ///
33959 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33960 /// AVX512F does not have FXOR, so FNEG is lowered as
33961 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33962 /// In this case we go though all bitcasts.
33963 static SDValue isFNEG(SDNode *N) {
33964   if (N->getOpcode() == ISD::FNEG)
33965     return N->getOperand(0);
33966
33967   SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33968   if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33969     return SDValue();
33970
33971   SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33972   if (!Op1.getValueType().isFloatingPoint())
33973     return SDValue();
33974
33975   SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33976
33977   unsigned EltBits = Op1.getScalarValueSizeInBits();
33978   auto isSignMask = [&](const ConstantFP *C) {
33979     return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33980   };
33981
33982   // There is more than one way to represent the same constant on
33983   // the different X86 targets. The type of the node may also depend on size.
33984   //  - load scalar value and broadcast
33985   //  - BUILD_VECTOR node
33986   //  - load from a constant pool.
33987   // We check all variants here.
33988   if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33989     if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33990       if (isSignMask(cast<ConstantFP>(C)))
33991         return Op0;
33992
33993   } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33994     if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33995       if (isSignMask(CN->getConstantFPValue()))
33996         return Op0;
33997
33998   } else if (auto *C = getTargetConstantFromNode(Op1)) {
33999     if (C->getType()->isVectorTy()) {
34000       if (auto *SplatV = C->getSplatValue())
34001         if (isSignMask(cast<ConstantFP>(SplatV)))
34002           return Op0;
34003     } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
34004       if (isSignMask(FPConst))
34005         return Op0;
34006   }
34007   return SDValue();
34008 }
34009
34010 /// Do target-specific dag combines on floating point negations.
34011 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
34012                            const X86Subtarget &Subtarget) {
34013   EVT OrigVT = N->getValueType(0);
34014   SDValue Arg = isFNEG(N);
34015   assert(Arg.getNode() && "N is expected to be an FNEG node");
34016
34017   EVT VT = Arg.getValueType();
34018   EVT SVT = VT.getScalarType();
34019   SDLoc DL(N);
34020
34021   // Let legalize expand this if it isn't a legal type yet.
34022   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34023     return SDValue();
34024
34025   // If we're negating a FMUL node on a target with FMA, then we can avoid the
34026   // use of a constant by performing (-0 - A*B) instead.
34027   // FIXME: Check rounding control flags as well once it becomes available.
34028   if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
34029       Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
34030     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
34031     SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
34032                                   Arg.getOperand(1), Zero);
34033     return DAG.getBitcast(OrigVT, NewNode);
34034   }
34035
34036   // If we're negating an FMA node, then we can adjust the
34037   // instruction to include the extra negation.
34038   unsigned NewOpcode = 0;
34039   if (Arg.hasOneUse()) {
34040     switch (Arg.getOpcode()) {
34041     case X86ISD::FMADD:        NewOpcode = X86ISD::FNMSUB;       break;
34042     case X86ISD::FMSUB:        NewOpcode = X86ISD::FNMADD;       break;
34043     case X86ISD::FNMADD:       NewOpcode = X86ISD::FMSUB;        break;
34044     case X86ISD::FNMSUB:       NewOpcode = X86ISD::FMADD;        break;
34045     case X86ISD::FMADD_RND:    NewOpcode = X86ISD::FNMSUB_RND;   break;
34046     case X86ISD::FMSUB_RND:    NewOpcode = X86ISD::FNMADD_RND;   break;
34047     case X86ISD::FNMADD_RND:   NewOpcode = X86ISD::FMSUB_RND;    break;
34048     case X86ISD::FNMSUB_RND:   NewOpcode = X86ISD::FMADD_RND;    break;
34049     // We can't handle scalar intrinsic node here because it would only
34050     // invert one element and not the whole vector. But we could try to handle
34051     // a negation of the lower element only.
34052     }
34053   }
34054   if (NewOpcode)
34055     return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
34056                                               Arg.getNode()->ops()));
34057
34058   return SDValue();
34059 }
34060
34061 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
34062                                  const X86Subtarget &Subtarget) {
34063   MVT VT = N->getSimpleValueType(0);
34064   // If we have integer vector types available, use the integer opcodes.
34065   if (VT.isVector() && Subtarget.hasSSE2()) {
34066     SDLoc dl(N);
34067
34068     MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
34069
34070     SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
34071     SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
34072     unsigned IntOpcode;
34073     switch (N->getOpcode()) {
34074     default: llvm_unreachable("Unexpected FP logic op");
34075     case X86ISD::FOR: IntOpcode = ISD::OR; break;
34076     case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
34077     case X86ISD::FAND: IntOpcode = ISD::AND; break;
34078     case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
34079     }
34080     SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
34081     return DAG.getBitcast(VT, IntOp);
34082   }
34083   return SDValue();
34084 }
34085
34086 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
34087                           TargetLowering::DAGCombinerInfo &DCI,
34088                           const X86Subtarget &Subtarget) {
34089   if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
34090     return Cmp;
34091
34092   if (DCI.isBeforeLegalizeOps())
34093     return SDValue();
34094
34095   if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
34096     return RV;
34097
34098   if (Subtarget.hasCMov())
34099     if (SDValue RV = combineIntegerAbs(N, DAG))
34100       return RV;
34101
34102   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34103     return FPLogic;
34104
34105   if (isFNEG(N))
34106     return combineFneg(N, DAG, Subtarget);
34107   return SDValue();
34108 }
34109
34110
34111 static bool isNullFPScalarOrVectorConst(SDValue V) {
34112   return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
34113 }
34114
34115 /// If a value is a scalar FP zero or a vector FP zero (potentially including
34116 /// undefined elements), return a zero constant that may be used to fold away
34117 /// that value. In the case of a vector, the returned constant will not contain
34118 /// undefined elements even if the input parameter does. This makes it suitable
34119 /// to be used as a replacement operand with operations (eg, bitwise-and) where
34120 /// an undef should not propagate.
34121 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
34122                                         const X86Subtarget &Subtarget) {
34123   if (!isNullFPScalarOrVectorConst(V))
34124     return SDValue();
34125
34126   if (V.getValueType().isVector())
34127     return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
34128
34129   return V;
34130 }
34131
34132 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
34133                                       const X86Subtarget &Subtarget) {
34134   SDValue N0 = N->getOperand(0);
34135   SDValue N1 = N->getOperand(1);
34136   EVT VT = N->getValueType(0);
34137   SDLoc DL(N);
34138
34139   // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
34140   if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
34141         (VT == MVT::f64 && Subtarget.hasSSE2())))
34142     return SDValue();
34143
34144   auto isAllOnesConstantFP = [](SDValue V) {
34145     auto *C = dyn_cast<ConstantFPSDNode>(V);
34146     return C && C->getConstantFPValue()->isAllOnesValue();
34147   };
34148
34149   // fand (fxor X, -1), Y --> fandn X, Y
34150   if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
34151     return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
34152
34153   // fand X, (fxor Y, -1) --> fandn Y, X
34154   if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
34155     return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
34156
34157   return SDValue();
34158 }
34159
34160 /// Do target-specific dag combines on X86ISD::FAND nodes.
34161 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
34162                            const X86Subtarget &Subtarget) {
34163   // FAND(0.0, x) -> 0.0
34164   if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
34165     return V;
34166
34167   // FAND(x, 0.0) -> 0.0
34168   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34169     return V;
34170
34171   if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
34172     return V;
34173
34174   return lowerX86FPLogicOp(N, DAG, Subtarget);
34175 }
34176
34177 /// Do target-specific dag combines on X86ISD::FANDN nodes.
34178 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
34179                             const X86Subtarget &Subtarget) {
34180   // FANDN(0.0, x) -> x
34181   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34182     return N->getOperand(1);
34183
34184   // FANDN(x, 0.0) -> 0.0
34185   if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34186     return V;
34187
34188   return lowerX86FPLogicOp(N, DAG, Subtarget);
34189 }
34190
34191 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
34192 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
34193                           const X86Subtarget &Subtarget) {
34194   assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
34195
34196   // F[X]OR(0.0, x) -> x
34197   if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34198     return N->getOperand(1);
34199
34200   // F[X]OR(x, 0.0) -> x
34201   if (isNullFPScalarOrVectorConst(N->getOperand(1)))
34202     return N->getOperand(0);
34203
34204   if (isFNEG(N))
34205     if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
34206       return NewVal;
34207
34208   return lowerX86FPLogicOp(N, DAG, Subtarget);
34209 }
34210
34211 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
34212 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
34213   assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
34214
34215   // Only perform optimizations if UnsafeMath is used.
34216   if (!DAG.getTarget().Options.UnsafeFPMath)
34217     return SDValue();
34218
34219   // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
34220   // into FMINC and FMAXC, which are Commutative operations.
34221   unsigned NewOp = 0;
34222   switch (N->getOpcode()) {
34223     default: llvm_unreachable("unknown opcode");
34224     case X86ISD::FMIN:  NewOp = X86ISD::FMINC; break;
34225     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
34226   }
34227
34228   return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
34229                      N->getOperand(0), N->getOperand(1));
34230 }
34231
34232 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
34233                                      const X86Subtarget &Subtarget) {
34234   if (Subtarget.useSoftFloat())
34235     return SDValue();
34236
34237   // TODO: Check for global or instruction-level "nnan". In that case, we
34238   //       should be able to lower to FMAX/FMIN alone.
34239   // TODO: If an operand is already known to be a NaN or not a NaN, this
34240   //       should be an optional swap and FMAX/FMIN.
34241
34242   EVT VT = N->getValueType(0);
34243   if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
34244         (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
34245         (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
34246     return SDValue();
34247
34248   // This takes at least 3 instructions, so favor a library call when operating
34249   // on a scalar and minimizing code size.
34250   if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
34251     return SDValue();
34252
34253   SDValue Op0 = N->getOperand(0);
34254   SDValue Op1 = N->getOperand(1);
34255   SDLoc DL(N);
34256   EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
34257       DAG.getDataLayout(), *DAG.getContext(), VT);
34258
34259   // There are 4 possibilities involving NaN inputs, and these are the required
34260   // outputs:
34261   //                   Op1
34262   //               Num     NaN
34263   //            ----------------
34264   //       Num  |  Max  |  Op0 |
34265   // Op0        ----------------
34266   //       NaN  |  Op1  |  NaN |
34267   //            ----------------
34268   //
34269   // The SSE FP max/min instructions were not designed for this case, but rather
34270   // to implement:
34271   //   Min = Op1 < Op0 ? Op1 : Op0
34272   //   Max = Op1 > Op0 ? Op1 : Op0
34273   //
34274   // So they always return Op0 if either input is a NaN. However, we can still
34275   // use those instructions for fmaxnum by selecting away a NaN input.
34276
34277   // If either operand is NaN, the 2nd source operand (Op0) is passed through.
34278   auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
34279   SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
34280   SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
34281
34282   // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
34283   // are NaN, the NaN value of Op1 is the result.
34284   return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
34285 }
34286
34287 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
34288 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
34289                             TargetLowering::DAGCombinerInfo &DCI,
34290                             const X86Subtarget &Subtarget) {
34291   // ANDNP(0, x) -> x
34292   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
34293     return N->getOperand(1);
34294
34295   // ANDNP(x, 0) -> 0
34296   if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
34297     return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
34298
34299   EVT VT = N->getValueType(0);
34300
34301   // Attempt to recursively combine a bitmask ANDNP with shuffles.
34302   if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34303     SDValue Op(N, 0);
34304     SmallVector<int, 1> NonceMask; // Just a placeholder.
34305     NonceMask.push_back(0);
34306     if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
34307                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
34308                                       DCI, Subtarget))
34309       return SDValue(); // This routine will use CombineTo to replace N.
34310   }
34311
34312   return SDValue();
34313 }
34314
34315 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
34316                          TargetLowering::DAGCombinerInfo &DCI) {
34317   // BT ignores high bits in the bit index operand.
34318   SDValue Op1 = N->getOperand(1);
34319   if (Op1.hasOneUse()) {
34320     unsigned BitWidth = Op1.getValueSizeInBits();
34321     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
34322     KnownBits Known;
34323     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
34324                                           !DCI.isBeforeLegalizeOps());
34325     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34326     if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
34327         TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
34328       DCI.CommitTargetLoweringOpt(TLO);
34329   }
34330   return SDValue();
34331 }
34332
34333 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
34334                                       const X86Subtarget &Subtarget) {
34335   EVT VT = N->getValueType(0);
34336   if (!VT.isVector())
34337     return SDValue();
34338
34339   SDValue N0 = N->getOperand(0);
34340   SDValue N1 = N->getOperand(1);
34341   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
34342   SDLoc dl(N);
34343
34344   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
34345   // both SSE and AVX2 since there is no sign-extended shift right
34346   // operation on a vector with 64-bit elements.
34347   //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
34348   // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
34349   if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
34350       N0.getOpcode() == ISD::SIGN_EXTEND)) {
34351     SDValue N00 = N0.getOperand(0);
34352
34353     // EXTLOAD has a better solution on AVX2,
34354     // it may be replaced with X86ISD::VSEXT node.
34355     if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
34356       if (!ISD::isNormalLoad(N00.getNode()))
34357         return SDValue();
34358
34359     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
34360         SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
34361                                   N00, N1);
34362       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
34363     }
34364   }
34365   return SDValue();
34366 }
34367
34368 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
34369 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
34370 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
34371 /// opportunities to combine math ops, use an LEA, or use a complex addressing
34372 /// mode. This can eliminate extend, add, and shift instructions.
34373 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
34374                                    const X86Subtarget &Subtarget) {
34375   if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
34376       Ext->getOpcode() != ISD::ZERO_EXTEND)
34377     return SDValue();
34378
34379   // TODO: This should be valid for other integer types.
34380   EVT VT = Ext->getValueType(0);
34381   if (VT != MVT::i64)
34382     return SDValue();
34383
34384   SDValue Add = Ext->getOperand(0);
34385   if (Add.getOpcode() != ISD::ADD)
34386     return SDValue();
34387
34388   bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
34389   bool NSW = Add->getFlags().hasNoSignedWrap();
34390   bool NUW = Add->getFlags().hasNoUnsignedWrap();
34391
34392   // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
34393   // into the 'zext'
34394   if ((Sext && !NSW) || (!Sext && !NUW))
34395     return SDValue();
34396
34397   // Having a constant operand to the 'add' ensures that we are not increasing
34398   // the instruction count because the constant is extended for free below.
34399   // A constant operand can also become the displacement field of an LEA.
34400   auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
34401   if (!AddOp1)
34402     return SDValue();
34403
34404   // Don't make the 'add' bigger if there's no hope of combining it with some
34405   // other 'add' or 'shl' instruction.
34406   // TODO: It may be profitable to generate simpler LEA instructions in place
34407   // of single 'add' instructions, but the cost model for selecting an LEA
34408   // currently has a high threshold.
34409   bool HasLEAPotential = false;
34410   for (auto *User : Ext->uses()) {
34411     if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
34412       HasLEAPotential = true;
34413       break;
34414     }
34415   }
34416   if (!HasLEAPotential)
34417     return SDValue();
34418
34419   // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
34420   int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
34421   SDValue AddOp0 = Add.getOperand(0);
34422   SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
34423   SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
34424
34425   // The wider add is guaranteed to not wrap because both operands are
34426   // sign-extended.
34427   SDNodeFlags Flags;
34428   Flags.setNoSignedWrap(NSW);
34429   Flags.setNoUnsignedWrap(NUW);
34430   return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
34431 }
34432
34433 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
34434 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
34435 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
34436 /// extends from AH (which we otherwise need to do contortions to access).
34437 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
34438   SDValue N0 = N->getOperand(0);
34439   auto OpcodeN = N->getOpcode();
34440   auto OpcodeN0 = N0.getOpcode();
34441   if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
34442         (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
34443     return SDValue();
34444
34445   EVT VT = N->getValueType(0);
34446   EVT InVT = N0.getValueType();
34447   if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
34448     return SDValue();
34449
34450   SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
34451   auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
34452                                                : X86ISD::UDIVREM8_ZEXT_HREG;
34453   SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
34454                           N0.getOperand(1));
34455   DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
34456   return R.getValue(1);
34457 }
34458
34459 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34460 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
34461 /// with UNDEFs) of the input to vectors of the same size as the target type
34462 /// which then extends the lowest elements.
34463 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
34464                                           TargetLowering::DAGCombinerInfo &DCI,
34465                                           const X86Subtarget &Subtarget) {
34466   unsigned Opcode = N->getOpcode();
34467   if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
34468     return SDValue();
34469   if (!DCI.isBeforeLegalizeOps())
34470     return SDValue();
34471   if (!Subtarget.hasSSE2())
34472     return SDValue();
34473
34474   SDValue N0 = N->getOperand(0);
34475   EVT VT = N->getValueType(0);
34476   EVT SVT = VT.getScalarType();
34477   EVT InVT = N0.getValueType();
34478   EVT InSVT = InVT.getScalarType();
34479
34480   // Input type must be a vector and we must be extending legal integer types.
34481   if (!VT.isVector())
34482     return SDValue();
34483   if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
34484     return SDValue();
34485   if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34486     return SDValue();
34487
34488   // On AVX2+ targets, if the input/output types are both legal then we will be
34489   // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34490   if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34491       DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34492     return SDValue();
34493
34494   SDLoc DL(N);
34495
34496   auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34497     EVT InVT = N.getValueType();
34498     EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34499                                  Size / InVT.getScalarSizeInBits());
34500     SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34501                                   DAG.getUNDEF(InVT));
34502     Opnds[0] = N;
34503     return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34504   };
34505
34506   // If target-size is less than 128-bits, extend to a type that would extend
34507   // to 128 bits, extend that and extract the original target vector.
34508   if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34509     unsigned Scale = 128 / VT.getSizeInBits();
34510     EVT ExVT =
34511         EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34512     SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34513     SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34514     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34515                        DAG.getIntPtrConstant(0, DL));
34516   }
34517
34518   // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
34519   // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
34520   // Also use this if we don't have SSE41 to allow the legalizer do its job.
34521   if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34522       (VT.is256BitVector() && Subtarget.hasInt256()) ||
34523       (VT.is512BitVector() && Subtarget.hasAVX512())) {
34524     SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34525     return Opcode == ISD::SIGN_EXTEND
34526                ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34527                : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34528   }
34529
34530   auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34531     unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34532     unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34533     EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34534     EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34535
34536     SmallVector<SDValue, 8> Opnds;
34537     for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34538       SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34539                                    DAG.getIntPtrConstant(Offset, DL));
34540       SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34541       SrcVec = Opcode == ISD::SIGN_EXTEND
34542                    ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34543                    : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34544       Opnds.push_back(SrcVec);
34545     }
34546     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34547   };
34548
34549   // On pre-AVX2 targets, split into 128-bit nodes of
34550   // ISD::*_EXTEND_VECTOR_INREG.
34551   if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34552     return SplitAndExtendInReg(128);
34553
34554   // On pre-AVX512 targets, split into 256-bit nodes of
34555   // ISD::*_EXTEND_VECTOR_INREG.
34556   if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34557     return SplitAndExtendInReg(256);
34558
34559   return SDValue();
34560 }
34561
34562 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34563                            TargetLowering::DAGCombinerInfo &DCI,
34564                            const X86Subtarget &Subtarget) {
34565   SDValue N0 = N->getOperand(0);
34566   EVT VT = N->getValueType(0);
34567   EVT InVT = N0.getValueType();
34568   SDLoc DL(N);
34569
34570   if (SDValue DivRem8 = getDivRem8(N, DAG))
34571     return DivRem8;
34572
34573   if (!DCI.isBeforeLegalizeOps()) {
34574     if (InVT == MVT::i1) {
34575       SDValue Zero = DAG.getConstant(0, DL, VT);
34576       SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34577       return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34578     }
34579     return SDValue();
34580   }
34581
34582   if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34583       isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34584     // Invert and sign-extend a boolean is the same as zero-extend and subtract
34585     // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
34586     // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34587     // sext (xor Bool, -1) --> sub (zext Bool), 1
34588     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34589     return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34590   }
34591
34592   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34593     return V;
34594
34595   if (Subtarget.hasAVX() && VT.is256BitVector())
34596     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34597       return R;
34598
34599   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34600     return NewAdd;
34601
34602   return SDValue();
34603 }
34604
34605 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34606                           const X86Subtarget &Subtarget) {
        // Fold negated operands of an FMA-style node into the opcode. Handles
        // ISD::FMA, X86ISD::FMADD and the FMADD_RND / FMADDS1_RND / FMADDS3_RND
        // forms (see the opcode dispatch at the end). Returns an empty SDValue
        // when the type is not legal yet, the scalar type is not f32/f64, or the
        // subtarget reports no FMA support; otherwise returns a rebuilt node.
        // Note that a node is rebuilt even when no operand was negated, so a
        // plain ISD::FMA comes back as X86ISD::FMADD.
34607   SDLoc dl(N);
34608   EVT VT = N->getValueType(0);
34609
34610   // Let legalize expand this if it isn't a legal type yet.
34611   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34612     return SDValue();
34613
34614   EVT ScalarVT = VT.getScalarType();
34615   if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34616     return SDValue();
34617
34618   SDValue A = N->getOperand(0);
34619   SDValue B = N->getOperand(1);
34620   SDValue C = N->getOperand(2);
34621
        // If isFNEG() yields a value for V, replace V with it in place and
        // report true (presumably the yielded value is the un-negated operand --
        // confirm against isFNEG). Otherwise V is left alone.
34622   auto invertIfNegative = [](SDValue &V) {
34623     if (SDValue NegVal = isFNEG(V.getNode())) {
34624       V = NegVal;
34625       return true;
34626     }
34627     return false;
34628   };
34629
34630   // Do not convert the passthru input of scalar intrinsics.
34631   // FIXME: We could allow negations of the lower element only.
        // The && short-circuits: operand A is never inspected for FMADDS1_RND and
        // operand C is never inspected for FMADDS3_RND.
34632   bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34633   bool NegB = invertIfNegative(B);
34634   bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34635
34636   // Negative multiplication when NegA xor NegB
34637   bool NegMul = (NegA != NegB);
34638
        // Pick the base opcode: a negated product selects the FNM* forms, a
        // negated addend selects the *SUB forms.
34639   unsigned NewOpcode;
34640   if (!NegMul)
34641     NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34642   else
34643     NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34644
34645
        // Map the base opcode onto the variant family of the original node.
        // NewOpcode is always one of the four cases, so no default is needed.
34646   if (N->getOpcode() == X86ISD::FMADD_RND) {
34647     switch (NewOpcode) {
34648     case X86ISD::FMADD:  NewOpcode = X86ISD::FMADD_RND; break;
34649     case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUB_RND; break;
34650     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34651     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34652     }
34653   } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34654     switch (NewOpcode) {
34655     case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS1_RND; break;
34656     case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS1_RND; break;
34657     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34658     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34659     }
34660   } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34661     switch (NewOpcode) {
34662     case X86ISD::FMADD:  NewOpcode = X86ISD::FMADDS3_RND; break;
34663     case X86ISD::FMSUB:  NewOpcode = X86ISD::FMSUBS3_RND; break;
34664     case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34665     case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34666     }
34667   } else {
          // Three-operand forms: rebuild without a fourth operand.
34668     assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34669            "Unexpected opcode!");
34670     return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34671   }
34672
        // The *_RND forms carry a fourth operand, forwarded unchanged
        // (presumably the rounding-mode operand -- confirm).
34673   return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34674 }
34675
34676 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34677                            TargetLowering::DAGCombinerInfo &DCI,
34678                            const X86Subtarget &Subtarget) {
        // Target combine for an extension node N (presumably ISD::ZERO_EXTEND,
        // going by the name and the comment below -- the opcode is not checked
        // here). Tries a sequence of rewrites and returns the first one that
        // produces a value; an empty SDValue means "no change".
        //
34679   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
34680   //           (and (i32 x86isd::setcc_carry), 1)
34681   // This eliminates the zext. This transformation is necessary because
34682   // ISD::SETCC is always legalized to i8.
34683   SDLoc dl(N);
34684   SDValue N0 = N->getOperand(0);
34685   EVT VT = N->getValueType(0);
34686
        // Only fire when both the AND and its first operand have a single use.
34687   if (N0.getOpcode() == ISD::AND &&
34688       N0.hasOneUse() &&
34689       N0.getOperand(0).hasOneUse()) {
34690     SDValue N00 = N0.getOperand(0);
34691     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
            // A mask other than the constant 1 ends the whole combine for N: the
            // later attempts further down are not tried in that case.
34692       if (!isOneConstant(N0.getOperand(1)))
34693         return SDValue();
            // Re-create the SETCC_CARRY directly in the wide type VT and mask it.
34694       return DAG.getNode(ISD::AND, dl, VT,
34695                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34696                                      N00.getOperand(0), N00.getOperand(1)),
34697                          DAG.getConstant(1, dl, VT));
34698     }
34699   }
34700
        // Same rewrite when the SETCC_CARRY is reached through a one-use
        // TRUNCATE instead of an AND:
        //   zext (trunc (setcc_carry)) -> and (setcc_carry in VT), 1
34701   if (N0.getOpcode() == ISD::TRUNCATE &&
34702       N0.hasOneUse() &&
34703       N0.getOperand(0).hasOneUse()) {
34704     SDValue N00 = N0.getOperand(0);
34705     if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34706       return DAG.getNode(ISD::AND, dl, VT,
34707                          DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34708                                      N00.getOperand(0), N00.getOperand(1)),
34709                          DAG.getConstant(1, dl, VT));
34710     }
34711   }
34712
        // The remaining attempts are delegated to shared helpers, in this order.
34713   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34714     return V;
34715
        // WidenMaskArithmetic is only tried for 256-bit vector results.
34716   if (VT.is256BitVector())
34717     if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34718       return R;
34719
34720   if (SDValue DivRem8 = getDivRem8(N, DAG))
34721     return DivRem8;
34722
34723   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34724     return NewAdd;
34725
34726   if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34727     return R;
34728
34729   return SDValue();
34730 }
34731
34732 /// Try to map a 128-bit or larger integer comparison to vector instructions
34733 /// before type legalization splits it up into chunks.
      ///
      /// \p SetCC must use SETEQ or SETNE (asserted). Only 128-bit compares with
      /// SSE2 and 256-bit compares with AVX2 are rewritten. Everything else,
      /// including compares whose second operand is the constant zero and
      /// operands that turn out to be f128 once bitcasts are peeked through,
      /// yields an empty SDValue.
34734 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34735                                                const X86Subtarget &Subtarget) {
34736   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34737   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34738
34739   // We're looking for an oversized integer equality comparison, but ignore a
34740   // comparison with zero because that gets special treatment in EmitTest().
34741   SDValue X = SetCC->getOperand(0);
34742   SDValue Y = SetCC->getOperand(1);
34743   EVT OpVT = X.getValueType();
34744   unsigned OpSize = OpVT.getSizeInBits();
34745   if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34746     return SDValue();
34747
34748   // Bail out if we know that this is not really just an oversized integer.
34749   if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
34750       peekThroughBitcasts(Y).getValueType() == MVT::f128)
34751     return SDValue();
34752
34753   // TODO: Use PXOR + PTEST for SSE4.1 or later?
34754   // TODO: Add support for AVX-512.
34755   EVT VT = SetCC->getValueType(0);
34756   SDLoc DL(SetCC);
34757   if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34758       (OpSize == 256 && Subtarget.hasAVX2())) {
          // Reinterpret both scalars as byte vectors of the same total width.
34759     EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34760     SDValue VecX = DAG.getBitcast(VecVT, X);
34761     SDValue VecY = DAG.getBitcast(VecVT, Y);
34762
34763     // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34764     // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34765     // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34766     // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34767     // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34768     SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34769     SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
          // The all-ones reference mask is 16 bits wide for the 128-bit case and
          // 32 bits wide for the 256-bit case; the original predicate is reused.
34770     SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34771                                     MVT::i32);
34772     return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34773   }
34774
34775   return SDValue();
34776 }
34777
34778 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34779                             const X86Subtarget &Subtarget) {
        // Target combine for a SETCC-style node: operands 0 and 1 are the values
        // being compared, operand 2 holds the condition code. Returns the
        // replacement value, or an empty SDValue when nothing applies.
34780   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34781   SDValue LHS = N->getOperand(0);
34782   SDValue RHS = N->getOperand(1);
34783   EVT VT = N->getValueType(0);
34784   SDLoc DL(N);
34785
34786   if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34787     EVT OpVT = LHS.getValueType();
34788     // 0-x == y --> x+y == 0
34789     // 0-x != y --> x+y != 0
          // Only when the negation has no other user.
34790     if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34791         LHS.hasOneUse()) {
34792       SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34793       return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34794     }
34795     // x == 0-y --> x+y == 0
34796     // x != 0-y --> x+y != 0
34797     if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34798         RHS.hasOneUse()) {
34799       SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34800       return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34801     }
34802
          // Oversized scalar integer equality: try the vector-compare form.
34803     if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34804       return V;
34805   }
34806
        // Result is i1 (or a vector of i1) and the predicate is eq/ne or signed:
        // look for (sext of an i1-typed value) compared against an all-zeros
        // build vector, in either operand order.
34807   if (VT.getScalarType() == MVT::i1 &&
34808       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34809     bool IsSEXT0 =
34810         (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34811         (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34812     bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34813
34814     if (!IsSEXT0 || !IsVZero1) {
34815       // Swap the operands and update the condition code.
            // Note: LHS/RHS/CC stay swapped even if this second attempt fails.
34816       std::swap(LHS, RHS);
34817       CC = ISD::getSetCCSwappedOperands(CC);
34818
34819       IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34820                 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34821       IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34822     }
34823
34824     if (IsSEXT0 && IsVZero1) {
            // NOTE(review): the assert message below reads "Uexpected"; it is a
            // runtime string, so it is deliberately left untouched here.
34825       assert(VT == LHS.getOperand(0).getValueType() &&
34826              "Uexpected operand type");
            // A sign-extended i1 is either 0 or -1. Compared against zero it is
            // never greater (SETGT folds to 0) and always less-or-equal (SETLE
            // folds to 1).
34827       if (CC == ISD::SETGT)
34828         return DAG.getConstant(0, DL, VT);
34829       if (CC == ISD::SETLE)
34830         return DAG.getConstant(1, DL, VT);
            // "== 0" and ">= 0" hold exactly when the i1 source is 0: return NOT.
34831       if (CC == ISD::SETEQ || CC == ISD::SETGE)
34832         return DAG.getNOT(DL, LHS.getOperand(0), VT);
34833
            // "!= 0" and "< 0" hold exactly when the i1 source is 1: return it.
34834       assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34835              "Unexpected condition code!");
34836       return LHS.getOperand(0);
34837     }
34838   }
34839
34840   // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34841   // to avoid scalarization via legalization because v4i32 is not a legal type.
34842   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34843       LHS.getValueType() == MVT::v4f32)
34844     return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34845
34846   return SDValue();
34847 }
34848
34849 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
        // Strips a SIGN_EXTEND_INREG from the mask operand (operand 2) of N by
        // rewriting N's operand list in place. Always returns an empty SDValue,
        // whether or not the operand was rewritten.
34850   SDLoc DL(N);
34851   // Gather and Scatter instructions use k-registers for masks. The type of
34852   // the masks is v*i1. So the mask will be truncated anyway.
34853   // The SIGN_EXTEND_INREG may be dropped.
34854   SDValue Mask = N->getOperand(2);
34855   if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
          // Copy all operands, substitute the un-extended mask, and update N.
          // NOTE(review): the value returned by UpdateNodeOperands is ignored
          // and the function still returns an empty SDValue afterwards --
          // confirm that callers do not need to be told N changed.
34856     SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34857     NewOps[2] = Mask.getOperand(0);
34858     DAG.UpdateNodeOperands(N, NewOps);
34859   }
34860   return SDValue();
34861 }
34862
34863 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
      // Operand 0 is the condition code (a constant), operand 1 the EFLAGS value.
      // Returns a new SETCC built from the simplified pair, or an empty SDValue
      // if combineSetCCEFLAGS found nothing. Subtarget is not used here.
34864 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34865                                const X86Subtarget &Subtarget) {
34866   SDLoc DL(N);
34867   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34868   SDValue EFLAGS = N->getOperand(1);
34869
34870   // Try to simplify the EFLAGS and condition code operands.
        // CC is passed to combineSetCCEFLAGS and then used to build the new SETCC
        // (presumably the helper may update it by reference -- confirm).
34871   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34872     return getSETCC(CC, Flags, DL, DAG);
34873
34874   return SDValue();
34875 }
34876
34877 /// Optimize branch condition evaluation.
      ///
      /// Operand 2 of \p N is the condition code (a constant) and operand 3 the
      /// EFLAGS value; operands 0 and 1 are forwarded unchanged to the new
      /// X86ISD::BRCOND. Returns an empty SDValue if the flags could not be
      /// simplified. \p Subtarget is not used here.
34878 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34879                              const X86Subtarget &Subtarget) {
34880   SDLoc DL(N);
34881   SDValue EFLAGS = N->getOperand(3);
34882   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34883
34884   // Try to simplify the EFLAGS and condition code operands.
34885   // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34886   // RAUW them under us.
34887   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
          // Operands 0 and 1 are re-read from N only now, after the helper ran,
          // in line with the warning above.
34888     SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34889     return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34890                        N->getOperand(1), Cond, Flags);
34891   }
34892
34893   return SDValue();
34894 }
34895
34896 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34897                                                   SelectionDAG &DAG) {
        // N is a unary node whose opcode is re-applied to a constant below.
        // Returns the rewritten value, or an empty SDValue when the pattern does
        // not match.
        //
34898   // Take advantage of vector comparisons producing 0 or -1 in each lane to
34899   // optimize away operation when it's from a constant.
34900   //
34901   // The general transformation is:
34902   //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34903   //       AND(VECTOR_CMP(x,y), constant2)
34904   //    constant2 = UNARYOP(constant)
34905
34906   // Early exit if this isn't a vector operation, the operand of the
34907   // unary operation isn't a bitwise AND, or if the sizes of the operations
34908   // aren't the same.
        // The AND's first operand must additionally be an ISD::SETCC.
34909   EVT VT = N->getValueType(0);
34910   if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34911       N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34912       VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34913     return SDValue();
34914
34915   // Now check that the other operand of the AND is a constant. We could
34916   // make the transformation for non-constant splats as well, but it's unclear
34917   // that would be a benefit as it would not eliminate any operations, just
34918   // perform one more step in scalar code before moving to the vector unit.
34919   if (BuildVectorSDNode *BV =
34920           dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34921     // Bail out if the vector isn't a constant.
34922     if (!BV->isConstant())
34923       return SDValue();
34924
34925     // Everything checks out. Build up the new and improved node.
34926     SDLoc DL(N);
34927     EVT IntVT = BV->getValueType(0);
34928     // Create a new constant of the appropriate type for the transformed
34929     // DAG.
          // This applies N's own opcode directly to the constant build vector.
34930     SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34931     // The AND node needs bitcasts to/from an integer vector type around it.
34932     SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34933     SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34934                                  N->getOperand(0)->getOperand(0), MaskConst);
34935     SDValue Res = DAG.getBitcast(VT, NewAnd);
34936     return Res;
34937   }
34938
34939   return SDValue();
34940 }
34941
34942 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34943                                const X86Subtarget &Subtarget) {
        // Target combine for an unsigned-int-to-FP conversion node. Two rewrites
        // are tried: widening narrow vector inputs to i32 elements, and switching
        // to SINT_TO_FP when the sign bit is known to be zero. Returns an empty
        // SDValue otherwise. Subtarget is not used here.
34944   SDValue Op0 = N->getOperand(0);
34945   EVT VT = N->getValueType(0);
34946   EVT InVT = Op0.getValueType();
34947   EVT InSVT = InVT.getScalarType();
34948   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34949
34950   // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34951   // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34952   if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34953     SDLoc dl(N);
34954     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34955                                  InVT.getVectorNumElements());
34956     SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34957
          // Keep the unsigned conversion if it is legal on the widened type;
          // otherwise use the signed one, which is equivalent here because the
          // zero-extended i8/i16 values have a clear sign bit in i32.
34958     if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34959       return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34960
34961     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34962   }
34963
34964   // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34965   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34966   // the optimization here.
34967   if (DAG.SignBitIsZero(Op0))
34968     return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34969
34970   return SDValue();
34971 }
34972
34973 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34974                                const X86Subtarget &Subtarget) {
        // Target combine for a signed-int-to-FP conversion node. Tries, in order:
        // the compare-and-mask constant fold, widening of narrow vector inputs,
        // narrowing of wide inputs whose upper bits are sign copies, and an
        // x87 FILD-based form for i64 loads on 32-bit targets. Returns an empty
        // SDValue if none applies.
34975   // First try to optimize away the conversion entirely when it's
34976   // conditionally from a constant. Vectors only.
34977   if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34978     return Res;
34979
34980   // Now move on to more general possibilities.
34981   SDValue Op0 = N->getOperand(0);
34982   EVT VT = N->getValueType(0);
34983   EVT InVT = Op0.getValueType();
34984   EVT InSVT = InVT.getScalarType();
34985
34986   // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34987   // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34988   // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
        // The vXi1 case only applies while the vXi1 type is not legal.
34989   if (InVT.isVector() &&
34990       (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34991        (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34992     SDLoc dl(N);
34993     EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34994                                  InVT.getVectorNumElements());
34995     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34996     return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34997   }
34998
34999   // Without AVX512DQ we only support i64 to float scalar conversion. For both
35000   // vectors and scalars, see if we know that the upper bits are all the sign
35001   // bit, in which case we can truncate the input to i32 and convert from that.
35002   if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
35003     unsigned BitWidth = InVT.getScalarSizeInBits();
35004     unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
          // BitWidth - 31 sign bits means bits [BitWidth-1 .. 31] are all equal,
          // so the value is representable as a signed 32-bit integer.
35005     if (NumSignBits >= (BitWidth - 31)) {
35006       EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
35007       if (InVT.isVector())
35008         TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
35009                                    InVT.getVectorNumElements());
35010       SDLoc dl(N);
35011       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
35012       return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
35013     }
35014   }
35015
35016   // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
35017   // a 32-bit target where SSE doesn't support i64->FP operations.
35018   if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
35019     LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
35020     EVT LdVT = Ld->getValueType(0);
35021
35022     // This transformation is not supported if the result type is f16 or f128.
35023     if (VT == MVT::f16 || VT == MVT::f128)
35024       return SDValue();
35025
          // Requirements: non-volatile, non-extending, single-use i64 load,
          // scalar result, and a 32-bit target.
35026     if (!Ld->isVolatile() && !VT.isVector() &&
35027         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
35028         !Subtarget.is64Bit() && LdVT == MVT::i64) {
35029       SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
35030           SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
            // Redirect users of the load's second result (value #1) to the
            // second result of the node returned by BuildFILD.
35031       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
35032       return FILDChain;
35033     }
35034   }
35035   return SDValue();
35036 }
35037
35038 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
        // If combineCarryThroughADD() can simplify the flags input (operand 2),
        // rebuild the X86ISD::SBB with the same two value operands and the new
        // flags. The rebuilt node produces (VT, i32). Returns an empty SDValue
        // when the flags input could not be simplified.
35039   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
35040     MVT VT = N->getSimpleValueType(0);
35041     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35042     return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
35043                        N->getOperand(0), N->getOperand(1),
35044                        Flags);
35045   }
35046
35047   return SDValue();
35048 }
35049
35050 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
      // Two rewrites are tried: "adc 0, 0" with a dead flags result becomes
      // (SETCC_CARRY & 1), and otherwise the incoming flags operand is simplified
      // through combineCarryThroughADD(). Returns an empty SDValue if neither
      // applies.
35051 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
35052                           X86TargetLowering::DAGCombinerInfo &DCI) {
35053   // If the LHS and RHS of the ADC node are zero, then it can't overflow and
35054   // the result is either zero or one (depending on the input carry bit).
35055   // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
35056   if (X86::isZeroNode(N->getOperand(0)) &&
35057       X86::isZeroNode(N->getOperand(1)) &&
35058       // We don't have a good way to replace an EFLAGS use, so only do this when
35059       // dead right now.
35060       SDValue(N, 1).use_empty()) {
35061     SDLoc DL(N);
35062     EVT VT = N->getValueType(0);
          // Replacement for the (unused) flags result: a zero constant of the
          // same type as result #1.
35063     SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
35064     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
35065                                DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35066                                            DAG.getConstant(X86::COND_B, DL,
35067                                                            MVT::i8),
35068                                            N->getOperand(2)),
35069                                DAG.getConstant(1, DL, VT));
          // Both results of N are replaced at once through the combiner.
35070     return DCI.CombineTo(N, Res1, CarryOut);
35071   }
35072
        // Otherwise try to simplify the incoming flags and rebuild the ADC with
        // the same value operands; the rebuilt node produces (VT, i32).
35073   if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
35074     MVT VT = N->getSimpleValueType(0);
35075     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35076     return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
35077                        N->getOperand(0), N->getOperand(1),
35078                        Flags);
35079   }
35080
35081   return SDValue();
35082 }
35083
35084 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
35085 /// which is more useful than 0/1 in some cases.
      ///
      /// \p N supplies the debug location and the result type, which must be i8
      /// or i1 (asserted). \p EFLAGS is the flags value read by the new
      /// SETCC_CARRY. For i8 the SETCC_CARRY is masked with 1; for i1 it is
      /// truncated.
35086 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
35087   SDLoc DL(N);
35088   // "Condition code B" is also known as "the carry flag" (CF).
35089   SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
        // The SETCC_CARRY itself is always created as an i8 value.
35090   SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
35091   MVT VT = N->getSimpleValueType(0);
35092   if (VT == MVT::i8)
35093     return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
35094
35095   assert(VT == MVT::i1 && "Unexpected type for SETCC node");
35096   return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
35097 }
35098
35099 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
35100 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
35101 /// with CMP+{ADC, SBB}.
      ///
      /// \p N is treated as a subtract when its opcode is ISD::SUB and as an add
      /// otherwise. Below, X names the non-setcc operand and Y the X86ISD::SETCC
      /// operand (possibly found under a one-use zext). Returns the replacement
      /// value, or an empty SDValue if no pattern matched.
35102 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
35103   bool IsSub = N->getOpcode() == ISD::SUB;
35104   SDValue X = N->getOperand(0);
35105   SDValue Y = N->getOperand(1);
35106
35107   // If this is an add, canonicalize a zext operand to the RHS.
35108   // TODO: Incomplete? What if both sides are zexts?
35109   if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
35110       Y.getOpcode() != ISD::ZERO_EXTEND)
35111     std::swap(X, Y);
35112
35113   // Look through a one-use zext.
35114   bool PeekedThroughZext = false;
35115   if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
35116     Y = Y.getOperand(0);
35117     PeekedThroughZext = true;
35118   }
35119
35120   // If this is an add, canonicalize a setcc operand to the RHS.
35121   // TODO: Incomplete? What if both sides are setcc?
35122   // TODO: Should we allow peeking through a zext of the other operand?
35123   if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
35124       Y.getOpcode() != X86ISD::SETCC)
35125     std::swap(X, Y);
35126
        // From here on Y must be a single-use X86ISD::SETCC: operand 0 is its
        // condition code and operand 1 its flags input.
35127   if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
35128     return SDValue();
35129
35130   SDLoc DL(N);
35131   EVT VT = N->getValueType(0);
35132   X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
35133
35134   // If X is -1 or 0, then we have an opportunity to avoid constants required in
35135   // the general case below.
35136   auto *ConstantX = dyn_cast<ConstantSDNode>(X);
35137   if (ConstantX) {
35138     if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
35139         (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
35140       // This is a complicated way to get -1 or 0 from the carry flag:
35141       // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35142       //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35143       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35144                          DAG.getConstant(X86::COND_B, DL, MVT::i8),
35145                          Y.getOperand(1));
35146     }
35147
35148     if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
35149         (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
35150       SDValue EFLAGS = Y->getOperand(1);
            // Only when the flags come from a single-use integer X86ISD::SUB
            // whose second operand is not a constant.
35151       if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35152           EFLAGS.getValueType().isInteger() &&
35153           !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35154         // Swap the operands of a SUB, and we have the same pattern as above.
35155         // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
35156         //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
35157         SDValue NewSub = DAG.getNode(
35158             X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
35159             EFLAGS.getOperand(1), EFLAGS.getOperand(0));
              // Select the same result number of the new SUB that the original
              // flags value referred to.
35160         SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35161         return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35162                            DAG.getConstant(X86::COND_B, DL, MVT::i8),
35163                            NewEFLAGS);
35164       }
35165     }
35166   }
35167
35168   if (CC == X86::COND_B) {
35169     // X + SETB Z --> X + (mask SBB Z, Z)
35170     // X - SETB Z --> X - (mask SBB Z, Z)
35171     // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
35172     SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
          // Adjust the materialized value to the width of the add/sub if needed.
35173     if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35174       SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35175     return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35176   }
35177
35178   if (CC == X86::COND_A) {
35179     SDValue EFLAGS = Y->getOperand(1);
35180     // Try to convert COND_A into COND_B in an attempt to facilitate
35181     // materializing "setb reg".
35182     //
35183     // Do not flip "e > c", where "c" is a constant, because Cmp instruction
35184     // cannot take an immediate as its first operand.
35185     //
35186     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35187         EFLAGS.getValueType().isInteger() &&
35188         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
            // Build the SUB with swapped operands, then proceed exactly as in the
            // COND_B case above using the new flags.
35189       SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
35190                                    EFLAGS.getNode()->getVTList(),
35191                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35192       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35193       SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
35194       if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35195         SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35196       return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35197     }
35198   }
35199
        // Everything below handles only equality tests against zero.
35200   if (CC != X86::COND_E && CC != X86::COND_NE)
35201     return SDValue();
35202
        // The flags must come from a single-use integer "X86ISD::CMP Z, 0".
35203   SDValue Cmp = Y.getOperand(1);
35204   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
35205       !X86::isZeroNode(Cmp.getOperand(1)) ||
35206       !Cmp.getOperand(0).getValueType().isInteger())
35207     return SDValue();
35208
35209   SDValue Z = Cmp.getOperand(0);
35210   EVT ZVT = Z.getValueType();
35211
35212   // If X is -1 or 0, then we have an opportunity to avoid constants required in
35213   // the general case below.
35214   if (ConstantX) {
35215     // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
35216     // fake operands:
35217     //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
35218     // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
35219     if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
35220         (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
35221       SDValue Zero = DAG.getConstant(0, DL, ZVT);
35222       SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
            // The negation is expressed as "0 - Z"; result #1 of that node is
            // the flags value consumed below.
35223       SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
35224       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35225                          DAG.getConstant(X86::COND_B, DL, MVT::i8),
35226                          SDValue(Neg.getNode(), 1));
35227     }
35228
35229     // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
35230     // with fake operands:
35231     //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
35232     // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
35233     if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
35234         (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
35235       SDValue One = DAG.getConstant(1, DL, ZVT);
35236       SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35237       return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35238                          DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
35239     }
35240   }
35241
        // General case: X is arbitrary. With CF = (Z == 0) from "cmp Z, 1":
        //   X - (Z != 0) = X + (-1) + CF      X + (Z != 0) = X - (-1) - CF
        //   X - (Z == 0) = X - 0 - CF         X + (Z == 0) = X + 0 + CF
35242   // (cmp Z, 1) sets the carry flag if Z is 0.
35243   SDValue One = DAG.getConstant(1, DL, ZVT);
35244   SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35245
35246   // Add the flags type for ADC/SBB nodes.
35247   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35248
35249   // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
35250   // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
35251   if (CC == X86::COND_NE)
35252     return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
35253                        DAG.getConstant(-1ULL, DL, VT), Cmp1);
35254
35255   // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
35256   // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
35257   return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
35258                      DAG.getConstant(0, DL, VT), Cmp1);
35259 }
35260
35261 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
35262                                       const X86Subtarget &Subtarget) {
        // Rewrites add(mul(a, b), phi) -- with the operands in either order --
        // into add(concat(VPMADDWD(trunc a, trunc b), zero), phi). Returns an
        // empty SDValue if neither operand is a MUL, the multiply cannot be
        // narrowed, or the vector size is outside the supported range.
        // NOTE(review): "Phi" is simply the non-MUL operand; nothing here checks
        // that it is a loop phi or that VT is a vector -- presumably the caller
        // guarantees both; confirm.
35263   SDValue MulOp = N->getOperand(0);
35264   SDValue Phi = N->getOperand(1);
35265
        // Accept the multiply on either side of the add.
35266   if (MulOp.getOpcode() != ISD::MUL)
35267     std::swap(MulOp, Phi);
35268   if (MulOp.getOpcode() != ISD::MUL)
35269     return SDValue();
35270
        // MULU16 mode is rejected (presumably because VPMADDWD multiplies signed
        // 16-bit values -- confirm against canReduceVMulWidth).
35271   ShrinkMode Mode;
35272   if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
35273     return SDValue();
35274
35275   EVT VT = N->getValueType(0);
35276
        // Widest vector register assumed available, in bits.
35277   unsigned RegSize = 128;
35278   if (Subtarget.hasBWI())
35279     RegSize = 512;
35280   else if (Subtarget.hasAVX2())
35281     RegSize = 256;
        // Size in bits of the operands once truncated to i16 elements.
35282   unsigned VectorSize = VT.getVectorNumElements() * 16;
35283   // If the vector size is less than 128, or greater than the supported RegSize,
35284   // do not use PMADD.
35285   if (VectorSize < 128 || VectorSize > RegSize)
35286     return SDValue();
35287
35288   SDLoc DL(N);
        // ReducedVT: same element count with i16 elements.
        // MAddVT: half the element count with i32 elements.
35289   EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35290                                    VT.getVectorNumElements());
35291   EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35292                                 VT.getVectorNumElements() / 2);
35293
35294   // Shrink the operands of mul.
35295   SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
35296   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
35297
35298   // Madd vector size is half of the original vector size
35299   SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
35300   // Fill the rest of the output with 0
35301   SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
35302   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
35303   return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
35304 }
35305
35306 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
35307                                      const X86Subtarget &Subtarget) {
35308   SDLoc DL(N);
35309   EVT VT = N->getValueType(0);
35310   SDValue Op0 = N->getOperand(0);
35311   SDValue Op1 = N->getOperand(1);
35312
35313   // TODO: There's nothing special about i32, any integer type above i16 should
35314   // work just as well.
35315   if (!VT.isVector() || !VT.isSimple() ||
35316       !(VT.getVectorElementType() == MVT::i32))
35317     return SDValue();
35318
35319   unsigned RegSize = 128;
35320   if (Subtarget.hasBWI())
35321     RegSize = 512;
35322   else if (Subtarget.hasAVX2())
35323     RegSize = 256;
35324
35325   // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
35326   // TODO: We should be able to handle larger vectors by splitting them before
35327   // feeding them into several SADs, and then reducing over those.
35328   if (VT.getSizeInBits() / 4 > RegSize)
35329     return SDValue();
35330
35331   // We know N is a reduction add, which means one of its operands is a phi.
35332   // To match SAD, we need the other operand to be a vector select.
35333   SDValue SelectOp, Phi;
35334   if (Op0.getOpcode() == ISD::VSELECT) {
35335     SelectOp = Op0;
35336     Phi = Op1;
35337   } else if (Op1.getOpcode() == ISD::VSELECT) {
35338     SelectOp = Op1;
35339     Phi = Op0;
35340   } else
35341     return SDValue();
35342
35343   // Check whether we have an abs-diff pattern feeding into the select.
35344   if(!detectZextAbsDiff(SelectOp, Op0, Op1))
35345     return SDValue();
35346
35347   // SAD pattern detected. Now build a SAD instruction and an addition for
35348   // reduction. Note that the number of elements of the result of SAD is less
35349   // than the number of elements of its input. Therefore, we could only update
35350   // part of elements in the reduction vector.
35351   SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
35352
35353   // The output of PSADBW is a vector of i64.
35354   // We need to turn the vector of i64 into a vector of i32.
35355   // If the reduction vector is at least as wide as the psadbw result, just
35356   // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
35357   // anyway.
35358   MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
35359   if (VT.getSizeInBits() >= ResVT.getSizeInBits())
35360     Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
35361   else
35362     Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
35363
35364   if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
35365     // Update part of elements of the reduction vector. This is done by first
35366     // extracting a sub-vector from it, updating this sub-vector, and inserting
35367     // it back.
35368     SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
35369                                  DAG.getIntPtrConstant(0, DL));
35370     SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
35371     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
35372                        DAG.getIntPtrConstant(0, DL));
35373   } else
35374     return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
35375 }
35376
35377 /// Convert vector increment or decrement to sub/add with an all-ones constant:
35378 /// add X, <1, 1...> --> sub X, <-1, -1...>
35379 /// sub X, <1, 1...> --> add X, <-1, -1...>
35380 /// The all-ones vector constant can be materialized using a pcmpeq instruction
35381 /// that is commonly recognized as an idiom (has no register dependency), so
35382 /// that's better/smaller than loading a splat 1 constant.
35383 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35384   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35385          "Unexpected opcode for increment/decrement transform");
35386
35387   // Pseudo-legality check: getOnesVector() expects one of these types, so bail
35388   // out and wait for legalization if we have an unsupported vector length.
35389   EVT VT = N->getValueType(0);
35390   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35391     return SDValue();
35392
35393   SDNode *N1 = N->getOperand(1).getNode();
35394   APInt SplatVal;
35395   if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) ||
35396       !SplatVal.isOneValue())
35397     return SDValue();
35398
35399   SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35400   unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35401   return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
35402 }
35403
35404 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
35405                           const X86Subtarget &Subtarget) {
35406   const SDNodeFlags Flags = N->getFlags();
35407   if (Flags.hasVectorReduction()) {
35408     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
35409       return Sad;
35410     if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
35411       return MAdd;
35412   }
35413   EVT VT = N->getValueType(0);
35414   SDValue Op0 = N->getOperand(0);
35415   SDValue Op1 = N->getOperand(1);
35416
35417   // Try to synthesize horizontal adds from adds of shuffles.
35418   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35419        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35420       isHorizontalBinOp(Op0, Op1, true))
35421     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
35422
35423   if (SDValue V = combineIncDecVector(N, DAG))
35424     return V;
35425
35426   return combineAddOrSubToADCOrSBB(N, DAG);
35427 }
35428
35429 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
35430                           const X86Subtarget &Subtarget) {
35431   SDValue Op0 = N->getOperand(0);
35432   SDValue Op1 = N->getOperand(1);
35433
35434   // X86 can't encode an immediate LHS of a sub. See if we can push the
35435   // negation into a preceding instruction.
35436   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
35437     // If the RHS of the sub is a XOR with one use and a constant, invert the
35438     // immediate. Then add one to the LHS of the sub so we can turn
35439     // X-Y -> X+~Y+1, saving one register.
35440     if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
35441         isa<ConstantSDNode>(Op1.getOperand(1))) {
35442       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
35443       EVT VT = Op0.getValueType();
35444       SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
35445                                    Op1.getOperand(0),
35446                                    DAG.getConstant(~XorC, SDLoc(Op1), VT));
35447       return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
35448                          DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
35449     }
35450   }
35451
35452   // Try to synthesize horizontal subs from subs of shuffles.
35453   EVT VT = N->getValueType(0);
35454   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35455        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35456       isHorizontalBinOp(Op0, Op1, false))
35457     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
35458
35459   if (SDValue V = combineIncDecVector(N, DAG))
35460     return V;
35461
35462   return combineAddOrSubToADCOrSBB(N, DAG);
35463 }
35464
35465 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
35466                              TargetLowering::DAGCombinerInfo &DCI,
35467                              const X86Subtarget &Subtarget) {
35468   if (DCI.isBeforeLegalize())
35469     return SDValue();
35470
35471   SDLoc DL(N);
35472   unsigned Opcode = N->getOpcode();
35473   MVT VT = N->getSimpleValueType(0);
35474   MVT SVT = VT.getVectorElementType();
35475   unsigned NumElts = VT.getVectorNumElements();
35476   unsigned EltSizeInBits = SVT.getSizeInBits();
35477
35478   SDValue Op = N->getOperand(0);
35479   MVT OpVT = Op.getSimpleValueType();
35480   MVT OpEltVT = OpVT.getVectorElementType();
35481   unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
35482   unsigned InputBits = OpEltSizeInBits * NumElts;
35483
35484   // Perform any constant folding.
35485   // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
35486   APInt UndefElts;
35487   SmallVector<APInt, 64> EltBits;
35488   if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
35489     APInt Undefs(NumElts, 0);
35490     SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
35491     bool IsZEXT =
35492         (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
35493     for (unsigned i = 0; i != NumElts; ++i) {
35494       if (UndefElts[i]) {
35495         Undefs.setBit(i);
35496         continue;
35497       }
35498       Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
35499                        : EltBits[i].sextOrTrunc(EltSizeInBits);
35500     }
35501     return getConstVector(Vals, Undefs, VT, DAG, DL);
35502   }
35503
35504   // (vzext (bitcast (vzext (x)) -> (vzext x)
35505   // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
35506   SDValue V = peekThroughBitcasts(Op);
35507   if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
35508     MVT InnerVT = V.getSimpleValueType();
35509     MVT InnerEltVT = InnerVT.getVectorElementType();
35510
35511     // If the element sizes match exactly, we can just do one larger vzext. This
35512     // is always an exact type match as vzext operates on integer types.
35513     if (OpEltVT == InnerEltVT) {
35514       assert(OpVT == InnerVT && "Types must match for vzext!");
35515       return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
35516     }
35517
35518     // The only other way we can combine them is if only a single element of the
35519     // inner vzext is used in the input to the outer vzext.
35520     if (InnerEltVT.getSizeInBits() < InputBits)
35521       return SDValue();
35522
35523     // In this case, the inner vzext is completely dead because we're going to
35524     // only look at bits inside of the low element. Just do the outer vzext on
35525     // a bitcast of the input to the inner.
35526     return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
35527   }
35528
35529   // Check if we can bypass extracting and re-inserting an element of an input
35530   // vector. Essentially:
35531   // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
35532   // TODO: Add X86ISD::VSEXT support
35533   if (Opcode == X86ISD::VZEXT &&
35534       V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35535       V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35536       V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
35537     SDValue ExtractedV = V.getOperand(0);
35538     SDValue OrigV = ExtractedV.getOperand(0);
35539     if (isNullConstant(ExtractedV.getOperand(1))) {
35540         MVT OrigVT = OrigV.getSimpleValueType();
35541         // Extract a subvector if necessary...
35542         if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
35543           int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
35544           OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
35545                                     OrigVT.getVectorNumElements() / Ratio);
35546           OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
35547                               DAG.getIntPtrConstant(0, DL));
35548         }
35549         Op = DAG.getBitcast(OpVT, OrigV);
35550         return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
35551       }
35552   }
35553
35554   return SDValue();
35555 }
35556
35557 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
35558 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
35559                                   const X86Subtarget &Subtarget) {
35560   SDValue Chain = N->getOperand(0);
35561   SDValue LHS = N->getOperand(1);
35562   SDValue RHS = N->getOperand(2);
35563   MVT VT = RHS.getSimpleValueType();
35564   SDLoc DL(N);
35565
35566   auto *C = dyn_cast<ConstantSDNode>(RHS);
35567   if (!C || C->getZExtValue() != 1)
35568     return SDValue();
35569
35570   RHS = DAG.getConstant(-1, DL, VT);
35571   MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
35572   return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
35573                                  DAG.getVTList(MVT::i32, MVT::Other),
35574                                  {Chain, LHS, RHS}, VT, MMO);
35575 }
35576
35577 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
35578 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
35579   SDValue Op0 = N->getOperand(0);
35580   SDValue Op1 = N->getOperand(1);
35581
35582   if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
35583     return SDValue();
35584
35585   EVT VT = N->getValueType(0);
35586   SDLoc DL(N);
35587
35588   return DAG.getNode(X86ISD::TESTM, DL, VT,
35589                      Op0->getOperand(0), Op0->getOperand(1));
35590 }
35591
35592 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
35593                                     const X86Subtarget &Subtarget) {
35594   MVT VT = N->getSimpleValueType(0);
35595   SDLoc DL(N);
35596
35597   if (N->getOperand(0) == N->getOperand(1)) {
35598     if (N->getOpcode() == X86ISD::PCMPEQ)
35599       return getOnesVector(VT, DAG, DL);
35600     if (N->getOpcode() == X86ISD::PCMPGT)
35601       return getZeroVector(VT, Subtarget, DAG, DL);
35602   }
35603
35604   return SDValue();
35605 }
35606
35607 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
35608                                       TargetLowering::DAGCombinerInfo &DCI,
35609                                       const X86Subtarget &Subtarget) {
35610   if (DCI.isBeforeLegalizeOps())
35611     return SDValue();
35612
35613   SDLoc dl(N);
35614   SDValue Vec = N->getOperand(0);
35615   SDValue SubVec = N->getOperand(1);
35616   SDValue Idx = N->getOperand(2);
35617
35618   unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
35619   MVT OpVT = N->getSimpleValueType(0);
35620   MVT SubVecVT = SubVec.getSimpleValueType();
35621
35622   // If this is an insert of an extract, combine to a shuffle. Don't do this
35623   // if the insert or extract can be represented with a subvector operation.
35624   if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35625       SubVec.getOperand(0).getSimpleValueType() == OpVT &&
35626       (IdxVal != 0 || !Vec.isUndef())) {
35627     int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
35628     if (ExtIdxVal != 0) {
35629       int VecNumElts = OpVT.getVectorNumElements();
35630       int SubVecNumElts = SubVecVT.getVectorNumElements();
35631       SmallVector<int, 64> Mask(VecNumElts);
35632       // First create an identity shuffle mask.
35633       for (int i = 0; i != VecNumElts; ++i)
35634         Mask[i] = i;
35635       // Now insert the extracted portion.
35636       for (int i = 0; i != SubVecNumElts; ++i)
35637         Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
35638
35639       return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
35640     }
35641   }
35642
35643   // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
35644   // load:
35645   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35646   //                   (load16 addr + 16), Elts/2)
35647   // --> load32 addr
35648   // or:
35649   // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35650   //                   (load32 addr + 32), Elts/2)
35651   // --> load64 addr
35652   // or a 16-byte or 32-byte broadcast:
35653   // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35654   //                   (load16 addr), Elts/2)
35655   // --> X86SubVBroadcast(load16 addr)
35656   // or:
35657   // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35658   //                   (load32 addr), Elts/2)
35659   // --> X86SubVBroadcast(load32 addr)
35660   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
35661       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35662       OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
35663     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
35664     if (Idx2 && Idx2->getZExtValue() == 0) {
35665       SDValue SubVec2 = Vec.getOperand(1);
35666       // If needed, look through bitcasts to get to the load.
35667       if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35668         bool Fast;
35669         unsigned Alignment = FirstLd->getAlignment();
35670         unsigned AS = FirstLd->getAddressSpace();
35671         const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35672         if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35673                                     OpVT, AS, Alignment, &Fast) && Fast) {
35674           SDValue Ops[] = {SubVec2, SubVec};
35675           if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
35676                                                     Subtarget, false))
35677             return Ld;
35678         }
35679       }
35680       // If lower/upper loads are the same and the only users of the load, then
35681       // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35682       if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35683         if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35684             SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35685           return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35686         }
35687       }
35688       // If this is subv_broadcast insert into both halves, use a larger
35689       // subv_broadcast.
35690       if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35691         return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35692                            SubVec.getOperand(0));
35693       }
35694     }
35695   }
35696
35697   return SDValue();
35698 }
35699
35700
35701 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35702                                              DAGCombinerInfo &DCI) const {
35703   SelectionDAG &DAG = DCI.DAG;
35704   switch (N->getOpcode()) {
35705   default: break;
35706   case ISD::EXTRACT_VECTOR_ELT:
35707     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35708   case X86ISD::PEXTRW:
35709   case X86ISD::PEXTRB:
35710     return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35711   case ISD::INSERT_SUBVECTOR:
35712     return combineInsertSubvector(N, DAG, DCI, Subtarget);
35713   case ISD::VSELECT:
35714   case ISD::SELECT:
35715   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35716   case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
35717   case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
35718   case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
35719   case ISD::SUB:            return combineSub(N, DAG, Subtarget);
35720   case X86ISD::SBB:         return combineSBB(N, DAG);
35721   case X86ISD::ADC:         return combineADC(N, DAG, DCI);
35722   case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
35723   case ISD::SHL:
35724   case ISD::SRA:
35725   case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
35726   case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
35727   case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
35728   case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
35729   case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
35730   case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
35731   case ISD::STORE:          return combineStore(N, DAG, Subtarget);
35732   case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
35733   case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
35734   case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
35735   case ISD::FADD:
35736   case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
35737   case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
35738   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
35739   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
35740   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
35741   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
35742   case X86ISD::FXOR:
35743   case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
35744   case X86ISD::FMIN:
35745   case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
35746   case ISD::FMINNUM:
35747   case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
35748   case X86ISD::BT:          return combineBT(N, DAG, DCI);
35749   case ISD::ANY_EXTEND:
35750   case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
35751   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
35752   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35753   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
35754   case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
35755   case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
35756   case X86ISD::VSHLI:
35757   case X86ISD::VSRAI:
35758   case X86ISD::VSRLI:
35759     return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35760   case ISD::SIGN_EXTEND_VECTOR_INREG:
35761   case ISD::ZERO_EXTEND_VECTOR_INREG:
35762   case X86ISD::VSEXT:
35763   case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
35764   case X86ISD::PINSRB:
35765   case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
35766   case X86ISD::SHUFP:       // Handle all target specific shuffles
35767   case X86ISD::INSERTPS:
35768   case X86ISD::EXTRQI:
35769   case X86ISD::INSERTQI:
35770   case X86ISD::PALIGNR:
35771   case X86ISD::VSHLDQ:
35772   case X86ISD::VSRLDQ:
35773   case X86ISD::BLENDI:
35774   case X86ISD::UNPCKH:
35775   case X86ISD::UNPCKL:
35776   case X86ISD::MOVHLPS:
35777   case X86ISD::MOVLHPS:
35778   case X86ISD::PSHUFB:
35779   case X86ISD::PSHUFD:
35780   case X86ISD::PSHUFHW:
35781   case X86ISD::PSHUFLW:
35782   case X86ISD::MOVSHDUP:
35783   case X86ISD::MOVSLDUP:
35784   case X86ISD::MOVDDUP:
35785   case X86ISD::MOVSS:
35786   case X86ISD::MOVSD:
35787   case X86ISD::VPPERM:
35788   case X86ISD::VPERMI:
35789   case X86ISD::VPERMV:
35790   case X86ISD::VPERMV3:
35791   case X86ISD::VPERMIV3:
35792   case X86ISD::VPERMIL2:
35793   case X86ISD::VPERMILPI:
35794   case X86ISD::VPERMILPV:
35795   case X86ISD::VPERM2X128:
35796   case X86ISD::VZEXT_MOVL:
35797   case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
35798   case X86ISD::FMADD:
35799   case X86ISD::FMADD_RND:
35800   case X86ISD::FMADDS1_RND:
35801   case X86ISD::FMADDS3_RND:
35802   case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
35803   case ISD::MGATHER:
35804   case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
35805   case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
35806   case X86ISD::TESTM:       return combineTestM(N, DAG);
35807   case X86ISD::PCMPEQ:
35808   case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
35809   }
35810
35811   return SDValue();
35812 }
35813
35814 /// Return true if the target has native support for the specified value type
35815 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35816 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35817 /// some i16 instructions are slow.
35818 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35819   if (!isTypeLegal(VT))
35820     return false;
35821   if (VT != MVT::i16)
35822     return true;
35823
35824   switch (Opc) {
35825   default:
35826     return true;
35827   case ISD::LOAD:
35828   case ISD::SIGN_EXTEND:
35829   case ISD::ZERO_EXTEND:
35830   case ISD::ANY_EXTEND:
35831   case ISD::SHL:
35832   case ISD::SRL:
35833   case ISD::SUB:
35834   case ISD::ADD:
35835   case ISD::MUL:
35836   case ISD::AND:
35837   case ISD::OR:
35838   case ISD::XOR:
35839     return false;
35840   }
35841 }
35842
35843 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35844 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35845 /// we don't adjust the stack we clobber the first frame index.
35846 /// See X86InstrInfo::copyPhysReg.
35847 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
35848   const MachineRegisterInfo &MRI = MF.getRegInfo();
35849   return any_of(MRI.reg_instructions(X86::EFLAGS),
35850                 [](const MachineInstr &RI) { return RI.isCopy(); });
35851 }
35852
35853 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
35854   if (hasCopyImplyingStackAdjustment(MF)) {
35855     MachineFrameInfo &MFI = MF.getFrameInfo();
35856     MFI.setHasCopyImplyingStackAdjustment(true);
35857   }
35858
35859   TargetLoweringBase::finalizeLowering(MF);
35860 }
35861
35862 /// This method query the target whether it is beneficial for dag combiner to
35863 /// promote the specified node. If true, it should return the desired promotion
35864 /// type by reference.
35865 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35866   EVT VT = Op.getValueType();
35867   if (VT != MVT::i16)
35868     return false;
35869
35870   bool Promote = false;
35871   bool Commute = false;
35872   switch (Op.getOpcode()) {
35873   default: break;
35874   case ISD::SIGN_EXTEND:
35875   case ISD::ZERO_EXTEND:
35876   case ISD::ANY_EXTEND:
35877     Promote = true;
35878     break;
35879   case ISD::SHL:
35880   case ISD::SRL: {
35881     SDValue N0 = Op.getOperand(0);
35882     // Look out for (store (shl (load), x)).
35883     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35884       return false;
35885     Promote = true;
35886     break;
35887   }
35888   case ISD::ADD:
35889   case ISD::MUL:
35890   case ISD::AND:
35891   case ISD::OR:
35892   case ISD::XOR:
35893     Commute = true;
35894     LLVM_FALLTHROUGH;
35895   case ISD::SUB: {
35896     SDValue N0 = Op.getOperand(0);
35897     SDValue N1 = Op.getOperand(1);
35898     if (!Commute && MayFoldLoad(N1))
35899       return false;
35900     // Avoid disabling potential load folding opportunities.
35901     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35902       return false;
35903     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35904       return false;
35905     Promote = true;
35906   }
35907   }
35908
35909   PVT = MVT::i32;
35910   return Promote;
35911 }
35912
35913 //===----------------------------------------------------------------------===//
35914 //                           X86 Inline Assembly Support
35915 //===----------------------------------------------------------------------===//
35916
35917 // Helper to match a string separated by whitespace.
35918 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35919   S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35920
35921   for (StringRef Piece : Pieces) {
35922     if (!S.startswith(Piece)) // Check if the piece matches.
35923       return false;
35924
35925     S = S.substr(Piece.size());
35926     StringRef::size_type Pos = S.find_first_not_of(" \t");
35927     if (Pos == 0) // We matched a prefix.
35928       return false;
35929
35930     S = S.substr(Pos);
35931   }
35932
35933   return S.empty();
35934 }
35935
35936 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35937
35938   if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35939     if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35940         std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35941         std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35942
35943       if (AsmPieces.size() == 3)
35944         return true;
35945       else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35946         return true;
35947     }
35948   }
35949   return false;
35950 }
35951
35952 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35953   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35954
35955   const std::string &AsmStr = IA->getAsmString();
35956
35957   IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35958   if (!Ty || Ty->getBitWidth() % 16 != 0)
35959     return false;
35960
35961   // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35962   SmallVector<StringRef, 4> AsmPieces;
35963   SplitString(AsmStr, AsmPieces, ";\n");
35964
35965   switch (AsmPieces.size()) {
35966   default: return false;
35967   case 1:
35968     // FIXME: this should verify that we are targeting a 486 or better.  If not,
35969     // we will turn this bswap into something that will be lowered to logical
35970     // ops instead of emitting the bswap asm.  For now, we don't support 486 or
35971     // lower so don't worry about this.
35972     // bswap $0
35973     if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35974         matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35975         matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35976         matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35977         matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35978         matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35979       // No need to check constraints, nothing other than the equivalent of
35980       // "=r,0" would be valid here.
35981       return IntrinsicLowering::LowerToByteSwap(CI);
35982     }
35983
35984     // rorw $$8, ${0:w}  -->  llvm.bswap.i16
35985     if (CI->getType()->isIntegerTy(16) &&
35986         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35987         (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35988          matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
35989       AsmPieces.clear();
35990       StringRef ConstraintsStr = IA->getConstraintString();
35991       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35992       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35993       if (clobbersFlagRegisters(AsmPieces))
35994         return IntrinsicLowering::LowerToByteSwap(CI);
35995     }
35996     break;
35997   case 3:
35998     if (CI->getType()->isIntegerTy(32) &&
35999         IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
36000         matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
36001         matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
36002         matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
36003       AsmPieces.clear();
36004       StringRef ConstraintsStr = IA->getConstraintString();
36005       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
36006       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
36007       if (clobbersFlagRegisters(AsmPieces))
36008         return IntrinsicLowering::LowerToByteSwap(CI);
36009     }
36010
36011     if (CI->getType()->isIntegerTy(64)) {
36012       InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
36013       if (Constraints.size() >= 2 &&
36014           Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
36015           Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
36016         // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
36017         if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
36018             matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
36019             matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
36020           return IntrinsicLowering::LowerToByteSwap(CI);
36021       }
36022     }
36023     break;
36024   }
36025   return false;
36026 }
36027
36028 /// Given a constraint letter, return the type of constraint for this target.
36029 X86TargetLowering::ConstraintType
36030 X86TargetLowering::getConstraintType(StringRef Constraint) const {
36031   if (Constraint.size() == 1) {
36032     switch (Constraint[0]) {
36033     case 'R':
36034     case 'q':
36035     case 'Q':
36036     case 'f':
36037     case 't':
36038     case 'u':
36039     case 'y':
36040     case 'x':
36041     case 'v':
36042     case 'Y':
36043     case 'l':
36044       return C_RegisterClass;
36045     case 'k': // AVX512 masking registers.
36046     case 'a':
36047     case 'b':
36048     case 'c':
36049     case 'd':
36050     case 'S':
36051     case 'D':
36052     case 'A':
36053       return C_Register;
36054     case 'I':
36055     case 'J':
36056     case 'K':
36057     case 'L':
36058     case 'M':
36059     case 'N':
36060     case 'G':
36061     case 'C':
36062     case 'e':
36063     case 'Z':
36064       return C_Other;
36065     default:
36066       break;
36067     }
36068   }
36069   else if (Constraint.size() == 2) {
36070     switch (Constraint[0]) {
36071     default:
36072       break;
36073     case 'Y':
36074       switch (Constraint[1]) {
36075       default:
36076         break;
36077       case 'k':
36078         return C_Register;
36079       }
36080     }
36081   }
36082   return TargetLowering::getConstraintType(Constraint);
36083 }
36084
36085 /// Examine constraint type and operand type and determine a weight value.
36086 /// This object must already have been set up with the operand type
36087 /// and the current alternative constraint selected.
36088 TargetLowering::ConstraintWeight
36089   X86TargetLowering::getSingleConstraintMatchWeight(
36090     AsmOperandInfo &info, const char *constraint) const {
36091   ConstraintWeight weight = CW_Invalid;
36092   Value *CallOperandVal = info.CallOperandVal;
36093     // If we don't have a value, we can't do a match,
36094     // but allow it at the lowest weight.
36095   if (!CallOperandVal)
36096     return CW_Default;
36097   Type *type = CallOperandVal->getType();
36098   // Look at the constraint type.
36099   switch (*constraint) {
36100   default:
36101     weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
36102     LLVM_FALLTHROUGH;
36103   case 'R':
36104   case 'q':
36105   case 'Q':
36106   case 'a':
36107   case 'b':
36108   case 'c':
36109   case 'd':
36110   case 'S':
36111   case 'D':
36112   case 'A':
36113     if (CallOperandVal->getType()->isIntegerTy())
36114       weight = CW_SpecificReg;
36115     break;
36116   case 'f':
36117   case 't':
36118   case 'u':
36119     if (type->isFloatingPointTy())
36120       weight = CW_SpecificReg;
36121     break;
36122   case 'y':
36123     if (type->isX86_MMXTy() && Subtarget.hasMMX())
36124       weight = CW_SpecificReg;
36125     break;
36126   case 'Y':
36127     // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
36128     if (constraint[1] == 'k') {
36129       // Support for 'Yk' (similarly to the 'k' variant below).
36130       weight = CW_SpecificReg;
36131       break;
36132     }
36133   // Else fall through (handle "Y" constraint).
36134     LLVM_FALLTHROUGH;
36135   case 'v':
36136     if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
36137       weight = CW_Register;
36138     LLVM_FALLTHROUGH;
36139   case 'x':
36140     if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
36141         ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
36142       weight = CW_Register;
36143     break;
36144   case 'k':
36145     // Enable conditional vector operations using %k<#> registers.
36146     weight = CW_SpecificReg;
36147     break;
36148   case 'I':
36149     if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
36150       if (C->getZExtValue() <= 31)
36151         weight = CW_Constant;
36152     }
36153     break;
36154   case 'J':
36155     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36156       if (C->getZExtValue() <= 63)
36157         weight = CW_Constant;
36158     }
36159     break;
36160   case 'K':
36161     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36162       if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
36163         weight = CW_Constant;
36164     }
36165     break;
36166   case 'L':
36167     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36168       if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
36169         weight = CW_Constant;
36170     }
36171     break;
36172   case 'M':
36173     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36174       if (C->getZExtValue() <= 3)
36175         weight = CW_Constant;
36176     }
36177     break;
36178   case 'N':
36179     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36180       if (C->getZExtValue() <= 0xff)
36181         weight = CW_Constant;
36182     }
36183     break;
36184   case 'G':
36185   case 'C':
36186     if (isa<ConstantFP>(CallOperandVal)) {
36187       weight = CW_Constant;
36188     }
36189     break;
36190   case 'e':
36191     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36192       if ((C->getSExtValue() >= -0x80000000LL) &&
36193           (C->getSExtValue() <= 0x7fffffffLL))
36194         weight = CW_Constant;
36195     }
36196     break;
36197   case 'Z':
36198     if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36199       if (C->getZExtValue() <= 0xffffffff)
36200         weight = CW_Constant;
36201     }
36202     break;
36203   }
36204   return weight;
36205 }
36206
36207 /// Try to replace an X constraint, which matches anything, with another that
36208 /// has more specific requirements based on the type of the corresponding
36209 /// operand.
36210 const char *X86TargetLowering::
36211 LowerXConstraint(EVT ConstraintVT) const {
36212   // FP X constraints get lowered to SSE1/2 registers if available, otherwise
36213   // 'f' like normal targets.
36214   if (ConstraintVT.isFloatingPoint()) {
36215     if (Subtarget.hasSSE2())
36216       return "Y";
36217     if (Subtarget.hasSSE1())
36218       return "x";
36219   }
36220
36221   return TargetLowering::LowerXConstraint(ConstraintVT);
36222 }
36223
36224 /// Lower the specified operand into the Ops vector.
36225 /// If it is invalid, don't add anything to Ops.
36226 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
36227                                                      std::string &Constraint,
36228                                                      std::vector<SDValue>&Ops,
36229                                                      SelectionDAG &DAG) const {
36230   SDValue Result;
36231
36232   // Only support length 1 constraints for now.
36233   if (Constraint.length() > 1) return;
36234
36235   char ConstraintLetter = Constraint[0];
36236   switch (ConstraintLetter) {
36237   default: break;
36238   case 'I':
36239     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36240       if (C->getZExtValue() <= 31) {
36241         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36242                                        Op.getValueType());
36243         break;
36244       }
36245     }
36246     return;
36247   case 'J':
36248     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36249       if (C->getZExtValue() <= 63) {
36250         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36251                                        Op.getValueType());
36252         break;
36253       }
36254     }
36255     return;
36256   case 'K':
36257     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36258       if (isInt<8>(C->getSExtValue())) {
36259         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36260                                        Op.getValueType());
36261         break;
36262       }
36263     }
36264     return;
36265   case 'L':
36266     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36267       if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
36268           (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
36269         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
36270                                        Op.getValueType());
36271         break;
36272       }
36273     }
36274     return;
36275   case 'M':
36276     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36277       if (C->getZExtValue() <= 3) {
36278         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36279                                        Op.getValueType());
36280         break;
36281       }
36282     }
36283     return;
36284   case 'N':
36285     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36286       if (C->getZExtValue() <= 255) {
36287         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36288                                        Op.getValueType());
36289         break;
36290       }
36291     }
36292     return;
36293   case 'O':
36294     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36295       if (C->getZExtValue() <= 127) {
36296         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36297                                        Op.getValueType());
36298         break;
36299       }
36300     }
36301     return;
36302   case 'e': {
36303     // 32-bit signed value
36304     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36305       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36306                                            C->getSExtValue())) {
36307         // Widen to 64 bits here to get it sign extended.
36308         Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
36309         break;
36310       }
36311     // FIXME gcc accepts some relocatable values here too, but only in certain
36312     // memory models; it's complicated.
36313     }
36314     return;
36315   }
36316   case 'Z': {
36317     // 32-bit unsigned value
36318     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36319       if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36320                                            C->getZExtValue())) {
36321         Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36322                                        Op.getValueType());
36323         break;
36324       }
36325     }
36326     // FIXME gcc accepts some relocatable values here too, but only in certain
36327     // memory models; it's complicated.
36328     return;
36329   }
36330   case 'i': {
36331     // Literal immediates are always ok.
36332     if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
36333       // Widen to 64 bits here to get it sign extended.
36334       Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
36335       break;
36336     }
36337
36338     // In any sort of PIC mode addresses need to be computed at runtime by
36339     // adding in a register or some sort of table lookup.  These can't
36340     // be used as immediates.
36341     if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
36342       return;
36343
36344     // If we are in non-pic codegen mode, we allow the address of a global (with
36345     // an optional displacement) to be used with 'i'.
36346     GlobalAddressSDNode *GA = nullptr;
36347     int64_t Offset = 0;
36348
36349     // Match either (GA), (GA+C), (GA+C1+C2), etc.
36350     while (1) {
36351       if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
36352         Offset += GA->getOffset();
36353         break;
36354       } else if (Op.getOpcode() == ISD::ADD) {
36355         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36356           Offset += C->getZExtValue();
36357           Op = Op.getOperand(0);
36358           continue;
36359         }
36360       } else if (Op.getOpcode() == ISD::SUB) {
36361         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36362           Offset += -C->getZExtValue();
36363           Op = Op.getOperand(0);
36364           continue;
36365         }
36366       }
36367
36368       // Otherwise, this isn't something we can handle, reject it.
36369       return;
36370     }
36371
36372     const GlobalValue *GV = GA->getGlobal();
36373     // If we require an extra load to get this address, as in PIC mode, we
36374     // can't accept it.
36375     if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
36376       return;
36377
36378     Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
36379                                         GA->getValueType(0), Offset);
36380     break;
36381   }
36382   }
36383
36384   if (Result.getNode()) {
36385     Ops.push_back(Result);
36386     return;
36387   }
36388   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
36389 }
36390
36391 /// Check if \p RC is a general purpose register class.
36392 /// I.e., GR* or one of their variant.
36393 static bool isGRClass(const TargetRegisterClass &RC) {
36394   return RC.hasSuperClassEq(&X86::GR8RegClass) ||
36395          RC.hasSuperClassEq(&X86::GR16RegClass) ||
36396          RC.hasSuperClassEq(&X86::GR32RegClass) ||
36397          RC.hasSuperClassEq(&X86::GR64RegClass) ||
36398          RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
36399 }
36400
36401 /// Check if \p RC is a vector register class.
36402 /// I.e., FR* / VR* or one of their variant.
36403 static bool isFRClass(const TargetRegisterClass &RC) {
36404   return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
36405          RC.hasSuperClassEq(&X86::FR64XRegClass) ||
36406          RC.hasSuperClassEq(&X86::VR128XRegClass) ||
36407          RC.hasSuperClassEq(&X86::VR256XRegClass) ||
36408          RC.hasSuperClassEq(&X86::VR512RegClass);
36409 }
36410
36411 std::pair<unsigned, const TargetRegisterClass *>
36412 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
36413                                                 StringRef Constraint,
36414                                                 MVT VT) const {
36415   // First, see if this is a constraint that directly corresponds to an LLVM
36416   // register class.
36417   if (Constraint.size() == 1) {
36418     // GCC Constraint Letters
36419     switch (Constraint[0]) {
36420     default: break;
36421       // TODO: Slight differences here in allocation order and leaving
36422       // RIP in the class. Do they matter any more here than they do
36423       // in the normal allocation?
36424     case 'k':
36425       if (Subtarget.hasAVX512()) {
36426         //  Only supported in AVX512 or later.
36427         switch (VT.SimpleTy) {
36428         default: break;
36429         case MVT::i32:
36430           return std::make_pair(0U, &X86::VK32RegClass);
36431         case MVT::i16:
36432           return std::make_pair(0U, &X86::VK16RegClass);
36433         case MVT::i8:
36434           return std::make_pair(0U, &X86::VK8RegClass);
36435         case MVT::i1:
36436           return std::make_pair(0U, &X86::VK1RegClass);
36437         case MVT::i64:
36438           return std::make_pair(0U, &X86::VK64RegClass);
36439         }
36440       }
36441       break;
36442     case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
36443       if (Subtarget.is64Bit()) {
36444         if (VT == MVT::i32 || VT == MVT::f32)
36445           return std::make_pair(0U, &X86::GR32RegClass);
36446         if (VT == MVT::i16)
36447           return std::make_pair(0U, &X86::GR16RegClass);
36448         if (VT == MVT::i8 || VT == MVT::i1)
36449           return std::make_pair(0U, &X86::GR8RegClass);
36450         if (VT == MVT::i64 || VT == MVT::f64)
36451           return std::make_pair(0U, &X86::GR64RegClass);
36452         break;
36453       }
36454       LLVM_FALLTHROUGH;
36455       // 32-bit fallthrough
36456     case 'Q':   // Q_REGS
36457       if (VT == MVT::i32 || VT == MVT::f32)
36458         return std::make_pair(0U, &X86::GR32_ABCDRegClass);
36459       if (VT == MVT::i16)
36460         return std::make_pair(0U, &X86::GR16_ABCDRegClass);
36461       if (VT == MVT::i8 || VT == MVT::i1)
36462         return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
36463       if (VT == MVT::i64)
36464         return std::make_pair(0U, &X86::GR64_ABCDRegClass);
36465       break;
36466     case 'r':   // GENERAL_REGS
36467     case 'l':   // INDEX_REGS
36468       if (VT == MVT::i8 || VT == MVT::i1)
36469         return std::make_pair(0U, &X86::GR8RegClass);
36470       if (VT == MVT::i16)
36471         return std::make_pair(0U, &X86::GR16RegClass);
36472       if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
36473         return std::make_pair(0U, &X86::GR32RegClass);
36474       return std::make_pair(0U, &X86::GR64RegClass);
36475     case 'R':   // LEGACY_REGS
36476       if (VT == MVT::i8 || VT == MVT::i1)
36477         return std::make_pair(0U, &X86::GR8_NOREXRegClass);
36478       if (VT == MVT::i16)
36479         return std::make_pair(0U, &X86::GR16_NOREXRegClass);
36480       if (VT == MVT::i32 || !Subtarget.is64Bit())
36481         return std::make_pair(0U, &X86::GR32_NOREXRegClass);
36482       return std::make_pair(0U, &X86::GR64_NOREXRegClass);
36483     case 'f':  // FP Stack registers.
36484       // If SSE is enabled for this VT, use f80 to ensure the isel moves the
36485       // value to the correct fpstack register class.
36486       if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
36487         return std::make_pair(0U, &X86::RFP32RegClass);
36488       if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
36489         return std::make_pair(0U, &X86::RFP64RegClass);
36490       return std::make_pair(0U, &X86::RFP80RegClass);
36491     case 'y':   // MMX_REGS if MMX allowed.
36492       if (!Subtarget.hasMMX()) break;
36493       return std::make_pair(0U, &X86::VR64RegClass);
36494     case 'Y':   // SSE_REGS if SSE2 allowed
36495       if (!Subtarget.hasSSE2()) break;
36496       LLVM_FALLTHROUGH;
36497     case 'v':
36498     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
36499       if (!Subtarget.hasSSE1()) break;
36500       bool VConstraint = (Constraint[0] == 'v');
36501
36502       switch (VT.SimpleTy) {
36503       default: break;
36504       // Scalar SSE types.
36505       case MVT::f32:
36506       case MVT::i32:
36507         if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
36508           return std::make_pair(0U, &X86::FR32XRegClass);
36509         return std::make_pair(0U, &X86::FR32RegClass);
36510       case MVT::f64:
36511       case MVT::i64:
36512         if (VConstraint && Subtarget.hasVLX())
36513           return std::make_pair(0U, &X86::FR64XRegClass);
36514         return std::make_pair(0U, &X86::FR64RegClass);
36515       // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36516       // Vector types.
36517       case MVT::v16i8:
36518       case MVT::v8i16:
36519       case MVT::v4i32:
36520       case MVT::v2i64:
36521       case MVT::v4f32:
36522       case MVT::v2f64:
36523         if (VConstraint && Subtarget.hasVLX())
36524           return std::make_pair(0U, &X86::VR128XRegClass);
36525         return std::make_pair(0U, &X86::VR128RegClass);
36526       // AVX types.
36527       case MVT::v32i8:
36528       case MVT::v16i16:
36529       case MVT::v8i32:
36530       case MVT::v4i64:
36531       case MVT::v8f32:
36532       case MVT::v4f64:
36533         if (VConstraint && Subtarget.hasVLX())
36534           return std::make_pair(0U, &X86::VR256XRegClass);
36535         return std::make_pair(0U, &X86::VR256RegClass);
36536       case MVT::v8f64:
36537       case MVT::v16f32:
36538       case MVT::v16i32:
36539       case MVT::v8i64:
36540         return std::make_pair(0U, &X86::VR512RegClass);
36541       }
36542       break;
36543     }
36544   } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
36545     switch (Constraint[1]) {
36546     default:
36547       break;
36548     case 'k':
36549       // This register class doesn't allocate k0 for masked vector operation.
36550       if (Subtarget.hasAVX512()) { // Only supported in AVX512.
36551         switch (VT.SimpleTy) {
36552         default: break;
36553         case MVT::i32:
36554           return std::make_pair(0U, &X86::VK32WMRegClass);
36555         case MVT::i16:
36556           return std::make_pair(0U, &X86::VK16WMRegClass);
36557         case MVT::i8:
36558           return std::make_pair(0U, &X86::VK8WMRegClass);
36559         case MVT::i1:
36560           return std::make_pair(0U, &X86::VK1WMRegClass);
36561         case MVT::i64:
36562           return std::make_pair(0U, &X86::VK64WMRegClass);
36563         }
36564       }
36565       break;
36566     }
36567   }
36568
36569   // Use the default implementation in TargetLowering to convert the register
36570   // constraint into a member of a register class.
36571   std::pair<unsigned, const TargetRegisterClass*> Res;
36572   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
36573
36574   // Not found as a standard register?
36575   if (!Res.second) {
36576     // Map st(0) -> st(7) -> ST0
36577     if (Constraint.size() == 7 && Constraint[0] == '{' &&
36578         tolower(Constraint[1]) == 's' &&
36579         tolower(Constraint[2]) == 't' &&
36580         Constraint[3] == '(' &&
36581         (Constraint[4] >= '0' && Constraint[4] <= '7') &&
36582         Constraint[5] == ')' &&
36583         Constraint[6] == '}') {
36584
36585       Res.first = X86::FP0+Constraint[4]-'0';
36586       Res.second = &X86::RFP80RegClass;
36587       return Res;
36588     }
36589
36590     // GCC allows "st(0)" to be called just plain "st".
36591     if (StringRef("{st}").equals_lower(Constraint)) {
36592       Res.first = X86::FP0;
36593       Res.second = &X86::RFP80RegClass;
36594       return Res;
36595     }
36596
36597     // flags -> EFLAGS
36598     if (StringRef("{flags}").equals_lower(Constraint)) {
36599       Res.first = X86::EFLAGS;
36600       Res.second = &X86::CCRRegClass;
36601       return Res;
36602     }
36603
36604     // 'A' means [ER]AX + [ER]DX.
36605     if (Constraint == "A") {
36606       if (Subtarget.is64Bit()) {
36607         Res.first = X86::RAX;
36608         Res.second = &X86::GR64_ADRegClass;
36609       } else {
36610         assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
36611                "Expecting 64, 32 or 16 bit subtarget");
36612         Res.first = X86::EAX;
36613         Res.second = &X86::GR32_ADRegClass;
36614       }
36615       return Res;
36616     }
36617     return Res;
36618   }
36619
36620   // Otherwise, check to see if this is a register class of the wrong value
36621   // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
36622   // turn into {ax},{dx}.
36623   // MVT::Other is used to specify clobber names.
36624   if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
36625     return Res;   // Correct type already, nothing to do.
36626
36627   // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
36628   // return "eax". This should even work for things like getting 64bit integer
36629   // registers when given an f64 type.
36630   const TargetRegisterClass *Class = Res.second;
36631   // The generic code will match the first register class that contains the
36632   // given register. Thus, based on the ordering of the tablegened file,
36633   // the "plain" GR classes might not come first.
36634   // Therefore, use a helper method.
36635   if (isGRClass(*Class)) {
36636     unsigned Size = VT.getSizeInBits();
36637     if (Size == 1) Size = 8;
36638     unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
36639     if (DestReg > 0) {
36640       Res.first = DestReg;
36641       Res.second = Size == 8 ? &X86::GR8RegClass
36642                  : Size == 16 ? &X86::GR16RegClass
36643                  : Size == 32 ? &X86::GR32RegClass
36644                  : &X86::GR64RegClass;
36645       assert(Res.second->contains(Res.first) && "Register in register class");
36646     } else {
36647       // No register found/type mismatch.
36648       Res.first = 0;
36649       Res.second = nullptr;
36650     }
36651   } else if (isFRClass(*Class)) {
36652     // Handle references to XMM physical registers that got mapped into the
36653     // wrong class.  This can happen with constraints like {xmm0} where the
36654     // target independent register mapper will just pick the first match it can
36655     // find, ignoring the required type.
36656
36657     // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36658     if (VT == MVT::f32 || VT == MVT::i32)
36659       Res.second = &X86::FR32RegClass;
36660     else if (VT == MVT::f64 || VT == MVT::i64)
36661       Res.second = &X86::FR64RegClass;
36662     else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
36663       Res.second = &X86::VR128RegClass;
36664     else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
36665       Res.second = &X86::VR256RegClass;
36666     else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
36667       Res.second = &X86::VR512RegClass;
36668     else {
36669       // Type mismatch and not a clobber: Return an error;
36670       Res.first = 0;
36671       Res.second = nullptr;
36672     }
36673   }
36674
36675   return Res;
36676 }
36677
36678 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
36679                                             const AddrMode &AM, Type *Ty,
36680                                             unsigned AS) const {
36681   // Scaling factors are not free at all.
36682   // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
36683   // will take 2 allocations in the out of order engine instead of 1
36684   // for plain addressing mode, i.e. inst (reg1).
36685   // E.g.,
36686   // vaddps (%rsi,%drx), %ymm0, %ymm1
36687   // Requires two allocations (one for the load, one for the computation)
36688   // whereas:
36689   // vaddps (%rsi), %ymm0, %ymm1
36690   // Requires just 1 allocation, i.e., freeing allocations for other operations
36691   // and having less micro operations to execute.
36692   //
36693   // For some X86 architectures, this is even worse because for instance for
36694   // stores, the complex addressing mode forces the instruction to use the
36695   // "load" ports instead of the dedicated "store" port.
36696   // E.g., on Haswell:
36697   // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
36698   // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
36699   if (isLegalAddressingMode(DL, AM, Ty, AS))
36700     // Scale represents reg2 * scale, thus account for 1
36701     // as soon as we use a second register.
36702     return AM.Scale != 0;
36703   return -1;
36704 }
36705
36706 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
36707   // Integer division on x86 is expensive. However, when aggressively optimizing
36708   // for code size, we prefer to use a div instruction, as it is usually smaller
36709   // than the alternative sequence.
36710   // The exception to this is vector division. Since x86 doesn't have vector
36711   // integer division, leaving the division as-is is a loss even in terms of
36712   // size, because it will have to be scalarized, while the alternative code
36713   // sequence can be performed in vector form.
36714   bool OptSize =
36715       Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
36716   return OptSize && !VT.isVector();
36717 }
36718
36719 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
36720   if (!Subtarget.is64Bit())
36721     return;
36722
36723   // Update IsSplitCSR in X86MachineFunctionInfo.
36724   X86MachineFunctionInfo *AFI =
36725     Entry->getParent()->getInfo<X86MachineFunctionInfo>();
36726   AFI->setIsSplitCSR(true);
36727 }
36728
36729 void X86TargetLowering::insertCopiesSplitCSR(
36730     MachineBasicBlock *Entry,
36731     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
36732   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36733   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
36734   if (!IStart)
36735     return;
36736
36737   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36738   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
36739   MachineBasicBlock::iterator MBBI = Entry->begin();
36740   for (const MCPhysReg *I = IStart; *I; ++I) {
36741     const TargetRegisterClass *RC = nullptr;
36742     if (X86::GR64RegClass.contains(*I))
36743       RC = &X86::GR64RegClass;
36744     else
36745       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
36746
36747     unsigned NewVR = MRI->createVirtualRegister(RC);
36748     // Create copy from CSR to a virtual register.
36749     // FIXME: this currently does not emit CFI pseudo-instructions, it works
36750     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
36751     // nounwind. If we want to generalize this later, we may need to emit
36752     // CFI pseudo-instructions.
36753     assert(Entry->getParent()->getFunction()->hasFnAttribute(
36754                Attribute::NoUnwind) &&
36755            "Function should be nounwind in insertCopiesSplitCSR!");
36756     Entry->addLiveIn(*I);
36757     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
36758         .addReg(*I);
36759
36760     // Insert the copy-back instructions right before the terminator.
36761     for (auto *Exit : Exits)
36762       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
36763               TII->get(TargetOpcode::COPY), *I)
36764           .addReg(NewVR);
36765   }
36766 }
36767
36768 bool X86TargetLowering::supportSwiftError() const {
36769   return Subtarget.is64Bit();
36770 }
36771
36772 /// Returns the name of the symbol used to emit stack probes or the empty
36773 /// string if not applicable.
36774 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
36775   // If the function specifically requests stack probes, emit them.
36776   if (MF.getFunction()->hasFnAttribute("probe-stack"))
36777     return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
36778
36779   // Generally, if we aren't on Windows, the platform ABI does not include
36780   // support for stack probes, so don't emit them.
36781   if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
36782     return "";
36783
36784   // We need a stack probe to conform to the Windows ABI. Choose the right
36785   // symbol.
36786   if (Subtarget.is64Bit())
36787     return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
36788   return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
36789 }